/***************************************************************************/
/*! 
    \file        aes.c

    \brief       AES implementation (FIPS-197 compliant)

    \author      Marko Wolf (mwolf@crypto.rub.de),
                 Andre Weimerskirch (weika@crypto.rub.de)
    
    \version     2.3.7
    
    \date        August 27, 2004
    
    \warning     For 32-bit only!
   
    \par Reviews:
    \li          25.08.2004 Splint 3.1.1 (no warnings at all options)

    \par Modifications:
	\li          18.07.2006 ESN-Listle  modified to work in BPCL

*/
/***************************************************************************/

#include "bpcl.h"
#include "bpcl_int.h"
/* Ten-byte table with external linkage.
   NOTE(review): not referenced in the part of the file visible here -
   confirm its users (and whether it must stay writable) before changing it. */
tU8 global_key_mask0[] =
{
    0xcb, 0x04, 0xec, 0x2d, 0x6b,
    0xe3, 0x53, 0x2d, 0x12, 0x3b
};

/* AES working context: both key schedules, the IV and the round count. */
typedef struct {
    /* encryption round keys (64 words) */
    tU32 enc_round_keys[64];
    /* decryption round keys (64 words) */
    tU32 dec_round_keys[64];
    /* 128-bit initialization vector */
    tU8  IV[16];
    /* number of rounds */
    int  number_rounds;
} aes_context;

#if ( BPCL_AES_FEAT_DO_MASKING == 0 )

// AES forward round (unmasked)
//
// AES_FORWARD_COLUMN builds one output column: it XORs four T-box lookups
// (top byte of c0, second byte of c1, third byte of c2, low byte of c3)
// with round-key word round_key[k]. Every argument is parenthesized so that
// an expression argument such as "a ^ b" is shifted as a whole.
#define AES_FORWARD_COLUMN(c0,c1,c2,c3,k)                                    \
  ( forward_T_box_0[ (int)( (tU8) ( (c0) >> 24 ) ) ] ^                       \
    forward_T_box_1[ (int)( (tU8) ( (c1) >> 16 ) ) ] ^                       \
    forward_T_box_2[ (int)( (tU8) ( (c2) >>  8 ) ) ] ^                       \
    forward_T_box_3[ (int)( (tU8) ( (c3)       ) ) ] ^ round_key[(k)] )

// AES_FORWARD_ROUND expands to a compound statement that
//   1. advances the pointer 'round_key' (must be in scope at the call site)
//      by four words, and
//   2. assigns the four output columns from the four input columns.
// The outputs are written in order 0..3, so they must not alias the inputs.
#define AES_FORWARD_ROUND(column_out_0,column_out_1,column_out_2,column_out_3,column_in_0,column_in_1,column_in_2,column_in_3) \
{                                                                                               \
  round_key += 4;                                                                               \
  /* ShiftRows() via input structure (Sec.5.1.1 Fig.6): rows 0,1,2,3 of output column j */      \
  /* are taken from input columns j, j+1, j+2, j+3 (mod 4)                               */      \
  column_out_0 = AES_FORWARD_COLUMN( column_in_0, column_in_1, column_in_2, column_in_3, 0 );   \
  column_out_1 = AES_FORWARD_COLUMN( column_in_1, column_in_2, column_in_3, column_in_0, 1 );   \
  column_out_2 = AES_FORWARD_COLUMN( column_in_2, column_in_3, column_in_0, column_in_1, 2 );   \
  column_out_3 = AES_FORWARD_COLUMN( column_in_3, column_in_0, column_in_1, column_in_2, 3 );   \
}

#else

// AES forward round (masked)
//
// AES_FORWARD_INDEX computes one table index: the input column is XORed with
// forward_mask[4], the byte at bit offset 'bits' is extracted, and
// forward_shift[row] is subtracted modulo 256.
// The column argument is parenthesized so that an expression argument such
// as "a | b" is XORed with the mask as a whole.
#define AES_FORWARD_INDEX(col,bits,row)                                                                       \
  ( ( (int)( (tU8) ( ( (col) ^ forward_mask[4] ) >> (bits) ) ) + 256 - (int)forward_shift[(row)] ) % 256 )

// AES_FORWARD_COLUMN builds one output column from four T-box lookups
// (rows 0..3 use bit offsets 24, 16, 8, 0) XORed with round_key[k].
#define AES_FORWARD_COLUMN(c0,c1,c2,c3,k)                                    \
  ( forward_T_box_0[ AES_FORWARD_INDEX( c0, 24, 0 ) ] ^                      \
    forward_T_box_1[ AES_FORWARD_INDEX( c1, 16, 1 ) ] ^                      \
    forward_T_box_2[ AES_FORWARD_INDEX( c2,  8, 2 ) ] ^                      \
    forward_T_box_3[ AES_FORWARD_INDEX( c3,  0, 3 ) ] ^ round_key[(k)] )

// AES_FORWARD_ROUND expands to a compound statement that
//   1. advances the pointer 'round_key' (must be in scope at the call site)
//      by four words, and
//   2. assigns the four output columns from the four input columns.
// The outputs are written in order 0..3, so they must not alias the inputs.
#define AES_FORWARD_ROUND(column_out_0,column_out_1,column_out_2,column_out_3,column_in_0,column_in_1,column_in_2,column_in_3) \
{                                                                                               \
  round_key += 4;                                                                               \
  /* forward_shiftRows() via input structure (Sec.5.1.1 Fig.6): rows 0,1,2,3 of output */       \
  /* column j are taken from input columns j, j+1, j+2, j+3 (mod 4)                    */       \
  column_out_0 = AES_FORWARD_COLUMN( column_in_0, column_in_1, column_in_2, column_in_3, 0 );   \
  column_out_1 = AES_FORWARD_COLUMN( column_in_1, column_in_2, column_in_3, column_in_0, 1 );   \
  column_out_2 = AES_FORWARD_COLUMN( column_in_2, column_in_3, column_in_0, column_in_1, 2 );   \
  column_out_3 = AES_FORWARD_COLUMN( column_in_3, column_in_0, column_in_1, column_in_2, 3 );   \
}

#endif


#if ( BPCL_AES_FEAT_DO_MASKING == 0 )

// AES inverse round (unmasked)
//
// AES_INVERSE_COLUMN builds one output column: it XORs four inverse T-box
// lookups (top byte of c0, second byte of c1, third byte of c2, low byte of
// c3) with round-key word round_key[k]. Every argument is parenthesized so
// that an expression argument such as "a ^ b" is shifted as a whole.
#define AES_INVERSE_COLUMN(c0,c1,c2,c3,k)                                    \
  ( inverse_T_box_0[ (int)( (tU8) ( (c0) >> 24 ) ) ] ^                       \
    inverse_T_box_1[ (int)( (tU8) ( (c1) >> 16 ) ) ] ^                       \
    inverse_T_box_2[ (int)( (tU8) ( (c2) >>  8 ) ) ] ^                       \
    inverse_T_box_3[ (int)( (tU8) ( (c3)       ) ) ] ^ round_key[(k)] )

// AES_INVERSE_ROUND expands to a compound statement that
//   1. advances the pointer 'round_key' (must be in scope at the call site)
//      by four words, and
//   2. assigns the four output columns from the four input columns.
// The outputs are written in order 0..3, so they must not alias the inputs.
#define AES_INVERSE_ROUND(column_out_0,column_out_1,column_out_2,column_out_3,column_in_0,column_in_1,column_in_2,column_in_3) \
{                                                                                               \
  round_key += 4;                                                                               \
  /* InvShiftRows() via input structure (Sec.5.3.1 Fig.13): rows 0,1,2,3 of output column j */  \
  /* are taken from input columns j, j-1, j-2, j-3 (mod 4)                                  */  \
  column_out_0 = AES_INVERSE_COLUMN( column_in_0, column_in_3, column_in_2, column_in_1, 0 );   \
  column_out_1 = AES_INVERSE_COLUMN( column_in_1, column_in_0, column_in_3, column_in_2, 1 );   \
  column_out_2 = AES_INVERSE_COLUMN( column_in_2, column_in_1, column_in_0, column_in_3, 2 );   \
  column_out_3 = AES_INVERSE_COLUMN( column_in_3, column_in_2, column_in_1, column_in_0, 3 );   \
}

#else

// AES inverse round (masked)
//
// AES_INVERSE_INDEX computes one table index: the input column is XORed with
// inverse_mask[4], the byte at bit offset 'bits' is extracted, and
// inverse_shift[row] is subtracted modulo 256.
// The column argument is parenthesized so that an expression argument such
// as "a | b" is XORed with the mask as a whole.
#define AES_INVERSE_INDEX(col,bits,row)                                                                       \
  ( ( (int)( (tU8) ( ( (col) ^ inverse_mask[4] ) >> (bits) ) ) + 256 - (int)inverse_shift[(row)] ) % 256 )

// AES_INVERSE_COLUMN builds one output column from four inverse T-box
// lookups (rows 0..3 use bit offsets 24, 16, 8, 0) XORed with round_key[k].
#define AES_INVERSE_COLUMN(c0,c1,c2,c3,k)                                    \
  ( inverse_T_box_0[ AES_INVERSE_INDEX( c0, 24, 0 ) ] ^                      \
    inverse_T_box_1[ AES_INVERSE_INDEX( c1, 16, 1 ) ] ^                      \
    inverse_T_box_2[ AES_INVERSE_INDEX( c2,  8, 2 ) ] ^                      \
    inverse_T_box_3[ AES_INVERSE_INDEX( c3,  0, 3 ) ] ^ round_key[(k)] )

// AES_INVERSE_ROUND expands to a compound statement that
//   1. advances the pointer 'round_key' (must be in scope at the call site)
//      by four words, and
//   2. assigns the four output columns from the four input columns.
// The outputs are written in order 0..3, so they must not alias the inputs.
#define AES_INVERSE_ROUND(column_out_0,column_out_1,column_out_2,column_out_3,column_in_0,column_in_1,column_in_2,column_in_3) \
{                                                                                               \
  round_key += 4;                                                                               \
  /* InvShiftRows() via input structure (Sec.5.3.1 Fig.13): rows 0,1,2,3 of output column j */  \
  /* are taken from input columns j, j-1, j-2, j-3 (mod 4)                                  */  \
  column_out_0 = AES_INVERSE_COLUMN( column_in_0, column_in_3, column_in_2, column_in_1, 0 );   \
  column_out_1 = AES_INVERSE_COLUMN( column_in_1, column_in_0, column_in_3, column_in_2, 1 );   \
  column_out_2 = AES_INVERSE_COLUMN( column_in_2, column_in_1, column_in_0, column_in_3, 2 );   \
  column_out_3 = AES_INVERSE_COLUMN( column_in_3, column_in_2, column_in_1, column_in_0, 3 );   \
}

#endif


/***************************************************************************
 * 3. DEFINITIONS                                                          *
 ***************************************************************************/

// Flag for the InvMixColumns() computation tables; starts at 0.
// NOTE(review): presumably set once the tables below are filled - the code
// that does so is outside this section.
static int inv_mix_cols_tables_ready = 0;

// InvMixColumns() computation tables: four tables of 256 32-bit words
static tU32 inv_mix_cols_table_0[256],
            inv_mix_cols_table_1[256],
            inv_mix_cols_table_2[256],
            inv_mix_cols_table_3[256];

// LFSR primitive polynomials from NIST for 128-bit x^128 + x^7 + x^2 + x + 1 = x^128 + 0x87
// 0x87 = binary 1000 0111, i.e. the low-order terms x^7 + x^2 + x + 1.
// NOTE(review): 0x80 is presumably the mask for the most significant bit of
// the top byte - its use is outside this section, confirm there.
#define C_LFSR_LOW_ORDER_PORTION_POLYNOMIAL 0x87
#define C_LFSR_HIGH_ORDER_PORTION_POLYNOMIAL 0x80


#if ( BPCL_AES_FEAT_DO_MASKING == 1 )

  // Two arrays of five 32-bit masks (forward / inverse), all zero initially.
  // The round macros in this file read element [4] only.
  static tU32 forward_mask[5] = { 0x00000000UL };
  static tU32 inverse_mask[5] = { 0x00000000UL };

  // Two arrays of four 8-bit shifts (forward / inverse), all zero initially.
  // The round macros subtract element [k] from the index into T-box k.
  static tU8 forward_shift[4] = { (tU8) 0 };
  static tU8 inverse_shift[4] = { (tU8) 0 };

  // 8-bit block counter, starts at zero
  static tU8 block_counter = 0;

#endif

/***************************************************************************
 * 4. IMPLEMENTATION OF FUNCTIONS                                          *
 ***************************************************************************/

/***************************************************************************/
/*!
   \par       FUNCTION F-AES/05: aes_compute_t_boxes

   \par       DESCRIPTION

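              This function computes the AES forward & inverse S-boxes,
              T-boxes and round constants at run time when
              BPCL_AES_FEAT_FIXED_T_BOXES == 0. With fixed (precomputed)
              tables it is an empty stub kept for compatibility.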

   \par       EQUATION / REFERENCE

              FIPS-197 Sec.5.1

   \par       INPUT ARGUMENTS

   \param     None

   \par       OUTPUT ARGUMENTS

   \param     None

   \par       RETURN VALUE

   \param     None

   \par       NOTES / WARNINGS / TODO's

   \note      Internal only!
*/
/***************************************************************************/
#if ( BPCL_AES_FEAT_FIXED_T_BOXES == 0 )

// Storage for the tables generated at run time by aes_compute_t_boxes().

// forward direction: S-box followed by the four T-boxes
static tU32 forward_S_box[256];
static tU32 forward_T_box_0[256],
            forward_T_box_1[256],
            forward_T_box_2[256],
            forward_T_box_3[256];

// inverse direction: S-box followed by the four T-boxes
static tU32 inverse_S_box[256];
static tU32 inverse_T_box_0[256],
            inverse_T_box_1[256],
            inverse_T_box_2[256],
            inverse_T_box_3[256];

// ten round constants
static tU32 rcon[10];

// t_boxes computation flag, initially 0 (tables not yet computed)
static int t_boxes_ready = 0;

// rotate a 32-bit word right by 8 bits; both halves are masked to 32 bits
#define ROTATE_8_RIGHT(x) ( (((x) << 24) & 0xFFFFFFFF) | (((x) & 0xFFFFFFFF) >>  8) )

// rotate an 8-bit value left by 1 bit; the result is cast back to tU8
#define ROTATE_BYTE_1_LEFT(x) ( (tU8) ( ( (x) << 1 ) | ( (x) >> 7 ) ) )

// x * {02} mod m(x) where x of GF(2^8) (see Sec. 4.2.1 FIPS-197)
// => ( x << 1 ) XOR { 0x00 if (x7 = 0) else 0x1B (x7 = 1) }
// The result is NOT truncated to 8 bits; callers cast it to tU8.
// The argument is parenthesized so that XTIME(a ^ b) doubles (a ^ b).
#define XTIME(x) ( ( (x) << 1 ) ^ ( ( ( (x) & 0x80 ) != (tU8) 0 ) ? 0x1B : 0x00 ) )

// x * y mod m(x) where x, y of GF(2^8), via discrete logarithms:
// => if ( x != 0 AND y != 0 ) then pow2[ ( log2[x] + log2[y] ) mod 255 ] else 0
// Requires byte tables named pow2[] (exponentiation) and log2[] (logarithm)
// to be in scope at the point of use.
#define MUL(x,y) ( ( ((x) != 0) && ((y) != 0) ) ? pow2[ (int)( log2[(x)] + log2[(y)] ) % 255 ] : (tU8) 0 )

// dynamic T-boxes computation
//
// Fills, at run time, the file-scope tables forward_S_box / inverse_S_box,
// forward_T_box_0..3, inverse_T_box_0..3 and rcon[]. Takes no arguments and
// returns nothing. It does not touch t_boxes_ready.
// NOTE(review): presumably the caller checks and sets that flag - confirm.
static void aes_compute_t_boxes( void )
{
  int i;
  tU8 x, y;
  tU8 pow2[256];  // exponentiation table: pow2[i] = {03}^i in GF(2^8)
  tU8 log2[256];  // logarithm table (inverse of pow2); log2[0] is never written and never read

  // compute the exponentiation and logarithm tables over GF(2^8) (see Sec. 4.2.1 FIPS-197).
  // Despite the names pow2/log2, the base is {03}: each step computes x = x XOR XTIME(x).
  // pow2[255] is {01} again, so the final iteration overwrites log2[1] with 255.
  x = (tU8) 1;
  for ( i = 0; i < 256; i++ )
  {
    pow2[i] = x;
    log2[(int)x] = (tU8) ( i );
    x ^= (tU8) ( XTIME( x ) );  // x = x * {03} = x XOR (x * {02}) in GF(2^8) with m(x) = x^8 + x^4 + x^3 + x + 1
  }

  // calculate the round constants: rcon[i] = [w32] = [ 2^i in GF(2^8) ]-[ 0 ]-[ 0 ]-[ 0 ]
  x = (tU8) 1;
  for ( i = 0; i < 10; i++ )
  {
    rcon[i] = (tU32) x << 24; // rcon = x << 24
    x = (tU8) ( XTIME( x ) ); // x = x * 2 in GF(2^8)
  }

  // generate the forward and inverse S-boxes (see Sec. 5.1.1 FIPS-197)
  // index 0 has no multiplicative inverse, so its entries are set by hand
  forward_S_box[0x00] = 0x63; // value not generated by the following loop
  inverse_S_box[0x63] = 0x00; // value not generated by the following loop

  for ( i = 1; i < 256; i++ )
  {
    // multiplicative inverse in GF(2^8)
    x = pow2[ 255 - (int)log2[i] ]; // x = g^(255 - log_g(i)) = 1/i in GF(2^8), with g = {03}

    // affine transformation formula 5.1 FIPS-197:
    // XOR x with its four successive 1-bit left rotations' results and the constant 0x63
    y = x;                         
    y = ROTATE_BYTE_1_LEFT(y);      
    x ^= y;                    // x[i] = x[i] XOR x[i+7 mod 8]
    y = ROTATE_BYTE_1_LEFT(y);
    x ^= y;                    // x[i] = x[i] XOR x[i+6 mod 8]  
    y = ROTATE_BYTE_1_LEFT(y);
    x ^= y;                    // x[i] = x[i] XOR x[i+5 mod 8]  
    y = ROTATE_BYTE_1_LEFT(y);
    x ^= y ^ 0x63;             // x[i] = x[i] XOR x[i+4 mod 8] XOR c[i] = 0x63

    // set forward & inverse S-box values
    forward_S_box[i] = (tU32) x;
    inverse_S_box[(int)x] = (tU32) i;
  }

  // generate the forward and inverse T-boxes (see Sec.5.1.2 & 5.1.3 FIPS-197)
  for ( i = 0; i < 256; i++ )
  {
    // ---- SubBytes() ---------
    x = (tU8) ( forward_S_box[i] ); // x = SubBytes(i)
    y = (tU8) ( XTIME( x ) );       // y = 2 * x in GF(2^8) 

    // ---- ShiftRows() --------

    // via input structure (done by AES_FORWARD_ROUND, not by the table)

    // ---- MixColumns() -------

    // [w32] = [2*x]-[1*x]-[1*x]-[3*x]   (most significant byte first; x ^ y = 3*x)
    forward_T_box_0[i] = ( (tU32) (x ^ y) ) ^   
                         ( (tU32) x <<  8 ) ^
                         ( (tU32) x << 16 ) ^
                         ( (tU32) y << 24 );

    // mask to 32-bit values (only has an effect if tU32 is wider than 32 bits)
    forward_T_box_0[i] &= 0xFFFFFFFF;

    // [w32] = [3*x]-[2*x]-[1*x]-[1*x]
    forward_T_box_1[i] = ROTATE_8_RIGHT( forward_T_box_0[i] );

    // [w32] = [1*x]-[3*x]-[2*x]-[1*x]
    forward_T_box_2[i] = ROTATE_8_RIGHT( forward_T_box_1[i] );

    // [w32] = [1*x]-[1*x]-[3*x]-[2*x]
    forward_T_box_3[i] = ROTATE_8_RIGHT( forward_T_box_2[i] );

    // ---- InvSubBytes() ------

    y = (tU8) ( inverse_S_box[i] ); // y = InvSubBytes(i)

    // ---- InvShiftRows() -----

    // via input structure (done by AES_INVERSE_ROUND, not by the table)

    // ---- InvMixColumns() ----

    // [w32] = [0e*y]-[09*y]-[0d*y]-[0b*y]   (most significant byte first)
     inverse_T_box_0[i] = ( (tU32) MUL( 0x0B, (int) y )       ) ^
                          ( (tU32) MUL( 0x0D, (int) y ) <<  8 ) ^
                          ( (tU32) MUL( 0x09, (int) y ) << 16 ) ^
                          ( (tU32) MUL( 0x0E, (int) y ) << 24 );

     // mask to 32-bit values (only has an effect if tU32 is wider than 32 bits)
     inverse_T_box_0[i] &= 0xFFFFFFFF;

     // [w32] = [0b*y]-[0e*y]-[09*y]-[0d*y]
     inverse_T_box_1[i] = ROTATE_8_RIGHT( inverse_T_box_0[i] );

     // [w32] = [0d*y]-[0b*y]-[0e*y]-[09*y]
     inverse_T_box_2[i] = ROTATE_8_RIGHT( inverse_T_box_1[i] );

     // [w32] = [09*y]-[0d*y]-[0b*y]-[0e*y]
     inverse_T_box_3[i] = ROTATE_8_RIGHT( inverse_T_box_2[i] );
  }
}

#else // #if ( BPCL_AES_FEAT_FIXED_T_BOXES == 0 )

  
// forward S-box = SubBytes() transformation (FIPS-197 Sec.5.1.1 Fig.7);
// each 8-bit substitution value is stored in its own 32-bit word
static const tU32 forward_S_box[256] =
{
  0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5,
  0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
  0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0,
  0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
  0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC,
  0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
  0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A,
  0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
  0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0,
  0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
  0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B,
  0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
  0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85,
  0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
  0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5,
  0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
  0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17,
  0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
  0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88,
  0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
  0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C,
  0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
  0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9,
  0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
  0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6,
  0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
  0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E,
  0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
  0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94,
  0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
  0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68,
  0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16
};


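// forward T-box including SubBytes() & MixColumns():
// entry i is V(a,b,c,d) with a = {02}*S(i), b = c = S(i), d = {03}*S(i),
// where S = forward_S_box (e.g. S(0) = 63 gives V(C6,63,63,A5))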
/*lint -e(146)*/ /*As binary constant is not intended*/
#define FORWARD_T_BOX                                             \
                                                                  \
  V(C6,63,63,A5), V(F8,7C,7C,84), V(EE,77,77,99), V(F6,7B,7B,8D), \
  V(FF,F2,F2,0D), V(D6,6B,6B,BD), V(DE,6F,6F,B1), V(91,C5,C5,54), \
  V(60,30,30,50), V(02,01,01,03), V(CE,67,67,A9), V(56,2B,2B,7D), \
  V(E7,FE,FE,19), V(B5,D7,D7,62), V(4D,AB,AB,E6), V(EC,76,76,9A), \
  V(8F,CA,CA,45), V(1F,82,82,9D), V(89,C9,C9,40), V(FA,7D,7D,87), \
  V(EF,FA,FA,15), V(B2,59,59,EB), V(8E,47,47,C9), V(FB,F0,F0,0B), \
  V(41,AD,AD,EC), V(B3,D4,D4,67), V(5F,A2,A2,FD), V(45,AF,AF,EA), \
  V(23,9C,9C,BF), V(53,A4,A4,F7), V(E4,72,72,96), V(9B,C0,C0,5B), \
  V(75,B7,B7,C2), V(E1,FD,FD,1C), V(3D,93,93,AE), V(4C,26,26,6A), \
  V(6C,36,36,5A), V(7E,3F,3F,41), V(F5,F7,F7,02), V(83,CC,CC,4F), \
  V(68,34,34,5C), V(51,A5,A5,F4), V(D1,E5,E5,34), V(F9,F1,F1,08), \
  V(E2,71,71,93), V(AB,D8,D8,73), V(62,31,31,53), V(2A,15,15,3F), \
  V(08,04,04,0C), V(95,C7,C7,52), V(46,23,23,65), V(9D,C3,C3,5E), \
  V(30,18,18,28), V(37,96,96,A1), V(0A,05,05,0F), V(2F,9A,9A,B5), \
  V(0E,07,07,09), V(24,12,12,36), V(1B,80,80,9B), V(DF,E2,E2,3D), \
  V(CD,EB,EB,26), V(4E,27,27,69), V(7F,B2,B2,CD), V(EA,75,75,9F), \
  V(12,09,09,1B), V(1D,83,83,9E), V(58,2C,2C,74), V(34,1A,1A,2E), \
  V(36,1B,1B,2D), V(DC,6E,6E,B2), V(B4,5A,5A,EE), V(5B,A0,A0,FB), \
  V(A4,52,52,F6), V(76,3B,3B,4D), V(B7,D6,D6,61), V(7D,B3,B3,CE), \
  V(52,29,29,7B), V(DD,E3,E3,3E), V(5E,2F,2F,71), V(13,84,84,97), \
  V(A6,53,53,F5), V(B9,D1,D1,68), V(00,00,00,00), V(C1,ED,ED,2C), \
  V(40,20,20,60), V(E3,FC,FC,1F), V(79,B1,B1,C8), V(B6,5B,5B,ED), \
  V(D4,6A,6A,BE), V(8D,CB,CB,46), V(67,BE,BE,D9), V(72,39,39,4B), \
  V(94,4A,4A,DE), V(98,4C,4C,D4), V(B0,58,58,E8), V(85,CF,CF,4A), \
  V(BB,D0,D0,6B), V(C5,EF,EF,2A), V(4F,AA,AA,E5), V(ED,FB,FB,16), \
  V(86,43,43,C5), V(9A,4D,4D,D7), V(66,33,33,55), V(11,85,85,94), \
  V(8A,45,45,CF), V(E9,F9,F9,10), V(04,02,02,06), V(FE,7F,7F,81), \
  V(A0,50,50,F0), V(78,3C,3C,44), V(25,9F,9F,BA), V(4B,A8,A8,E3), \
  V(A2,51,51,F3), V(5D,A3,A3,FE), V(80,40,40,C0), V(05,8F,8F,8A), \
  V(3F,92,92,AD), V(21,9D,9D,BC), V(70,38,38,48), V(F1,F5,F5,04), \
  V(63,BC,BC,DF), V(77,B6,B6,C1), V(AF,DA,DA,75), V(42,21,21,63), \
  V(20,10,10,30), V(E5,FF,FF,1A), V(FD,F3,F3,0E), V(BF,D2,D2,6D), \
  V(81,CD,CD,4C), V(18,0C,0C,14), V(26,13,13,35), V(C3,EC,EC,2F), \
  V(BE,5F,5F,E1), V(35,97,97,A2), V(88,44,44,CC), V(2E,17,17,39), \
  V(93,C4,C4,57), V(55,A7,A7,F2), V(FC,7E,7E,82), V(7A,3D,3D,47), \
  V(C8,64,64,AC), V(BA,5D,5D,E7), V(32,19,19,2B), V(E6,73,73,95), \
  V(C0,60,60,A0), V(19,81,81,98), V(9E,4F,4F,D1), V(A3,DC,DC,7F), \
  V(44,22,22,66), V(54,2A,2A,7E), V(3B,90,90,AB), V(0B,88,88,83), \
  V(8C,46,46,CA), V(C7,EE,EE,29), V(6B,B8,B8,D3), V(28,14,14,3C), \
  V(A7,DE,DE,79), V(BC,5E,5E,E2), V(16,0B,0B,1D), V(AD,DB,DB,76), \
  V(DB,E0,E0,3B), V(64,32,32,56), V(74,3A,3A,4E), V(14,0A,0A,1E), \
  V(92,49,49,DB), V(0C,06,06,0A), V(48,24,24,6C), V(B8,5C,5C,E4), \
  V(9F,C2,C2,5D), V(BD,D3,D3,6E), V(43,AC,AC,EF), V(C4,62,62,A6), \
  V(39,91,91,A8), V(31,95,95,A4), V(D3,E4,E4,37), V(F2,79,79,8B), \
  V(D5,E7,E7,32), V(8B,C8,C8,43), V(6E,37,37,59), V(DA,6D,6D,B7), \
  V(01,8D,8D,8C), V(B1,D5,D5,64), V(9C,4E,4E,D2), V(49,A9,A9,E0), \
  V(D8,6C,6C,B4), V(AC,56,56,FA), V(F3,F4,F4,07), V(CF,EA,EA,25), \
  V(CA,65,65,AF), V(F4,7A,7A,8E), V(47,AE,AE,E9), V(10,08,08,18), \
  V(6F,BA,BA,D5), V(F0,78,78,88), V(4A,25,25,6F), V(5C,2E,2E,72), \
  V(38,1C,1C,24), V(57,A6,A6,F1), V(73,B4,B4,C7), V(97,C6,C6,51), \
  V(CB,E8,E8,23), V(A1,DD,DD,7C), V(E8,74,74,9C), V(3E,1F,1F,21), \
  V(96,4B,4B,DD), V(61,BD,BD,DC), V(0D,8B,8B,86), V(0F,8A,8A,85), \
  V(E0,70,70,90), V(7C,3E,3E,42), V(71,B5,B5,C4), V(CC,66,66,AA), \
  V(90,48,48,D8), V(06,03,03,05), V(F7,F6,F6,01), V(1C,0E,0E,12), \
  V(C2,61,61,A3), V(6A,35,35,5F), V(AE,57,57,F9), V(69,B9,B9,D0), \
  V(17,86,86,91), V(99,C1,C1,58), V(3A,1D,1D,27), V(27,9E,9E,B9), \
  V(D9,E1,E1,38), V(EB,F8,F8,13), V(2B,98,98,B3), V(22,11,11,33), \
  V(D2,69,69,BB), V(A9,D9,D9,70), V(07,8E,8E,89), V(33,94,94,A7), \
  V(2D,9B,9B,B6), V(3C,1E,1E,22), V(15,87,87,92), V(C9,E9,E9,20), \
  V(87,CE,CE,49), V(AA,55,55,FF), V(50,28,28,78), V(A5,DF,DF,7A), \
  V(03,8C,8C,8F), V(59,A1,A1,F8), V(09,89,89,80), V(1A,0D,0D,17), \
  V(65,BF,BF,DA), V(D7,E6,E6,31), V(84,42,42,C6), V(D0,68,68,B8), \
  V(82,41,41,C3), V(29,99,99,B0), V(5A,2D,2D,77), V(1E,0F,0F,11), \
  V(7B,B0,B0,CB), V(A8,54,54,FC), V(6D,BB,BB,D6), V(2C,16,16,3A)

// The four forward T-boxes are generated from the single FORWARD_T_BOX list
// by redefining V() so that the four bytes of every entry are pasted into a
// hex constant in a different order. Each table is the previous one with
// every word rotated right by 8 bits.
// NOTE(review): unlike the S-boxes these arrays are not const; presumably
// they are modified at run time elsewhere - confirm before adding const.

// forward_T_box_0: byte order a-b-c-d (as listed)
#define V(a,b,c,d) 0x##a##b##c##d
/*lint -e{146}*/ /*As binary constant is not intended*/
static tU32 forward_T_box_0[256] = { FORWARD_T_BOX };
#undef V

// forward_T_box_1: byte order d-a-b-c
#define V(a,b,c,d) 0x##d##a##b##c
/*lint -e{146}*/ /*As binary constant is not intended*/
static tU32 forward_T_box_1[256] = { FORWARD_T_BOX };
#undef V

// forward_T_box_2: byte order c-d-a-b
#define V(a,b,c,d) 0x##c##d##a##b
/*lint -e{146}*/ /*As binary constant is not intended*/
static tU32 forward_T_box_2[256] = { FORWARD_T_BOX };
#undef V

// forward_T_box_3: byte order b-c-d-a
#define V(a,b,c,d) 0x##b##c##d##a
/*lint -e{146}*/ /*As binary constant is not intended*/
static tU32 forward_T_box_3[256] = { FORWARD_T_BOX };
#undef V

// the entry list is not needed after the four tables have been instantiated
#undef FORWARD_T_BOX


// inverse S-box = InvSubBytes() transformation (FIPS-197 Sec.5.3.2 Fig.14);
// each 8-bit substitution value is stored in its own 32-bit word
static const tU32 inverse_S_box[256] =
{
  0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38,
  0xBF, 0x40, 0xA3, 0x9E, 0x81, 0xF3, 0xD7, 0xFB,
  0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87,
  0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB,
  0x54, 0x7B, 0x94, 0x32, 0xA6, 0xC2, 0x23, 0x3D,
  0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E,
  0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2,
  0x76, 0x5B, 0xA2, 0x49, 0x6D, 0x8B, 0xD1, 0x25,
  0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16,
  0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92,
  0x6C, 0x70, 0x48, 0x50, 0xFD, 0xED, 0xB9, 0xDA,
  0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84,
  0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A,
  0xF7, 0xE4, 0x58, 0x05, 0xB8, 0xB3, 0x45, 0x06,
  0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02,
  0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B,
  0x3A, 0x91, 0x11, 0x41, 0x4F, 0x67, 0xDC, 0xEA,
  0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73,
  0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85,
  0xE2, 0xF9, 0x37, 0xE8, 0x1C, 0x75, 0xDF, 0x6E,
  0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89,
  0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B,
  0xFC, 0x56, 0x3E, 0x4B, 0xC6, 0xD2, 0x79, 0x20,
  0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4,
  0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31,
  0xB1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xEC, 0x5F,
  0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D,
  0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF,
  0xA0, 0xE0, 0x3B, 0x4D, 0xAE, 0x2A, 0xF5, 0xB0,
  0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61,
  0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26,
  0xE1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0C, 0x7D
};


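// inverse T-box including InvSubBytes() & InvMixColumns():
// entry i is V(a,b,c,d) with a = {0E}*y, b = {09}*y, c = {0D}*y, d = {0B}*y,
// where y = inverse_S_box[i] (e.g. y = 52 for i = 0 gives V(51,F4,A7,50))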
/*lint -e(146)*/ /*As binary constant is not intended*/
#define INVERSE_T_BOX                                             \
                                                                  \
  V(51,F4,A7,50), V(7E,41,65,53), V(1A,17,A4,C3), V(3A,27,5E,96), \
  V(3B,AB,6B,CB), V(1F,9D,45,F1), V(AC,FA,58,AB), V(4B,E3,03,93), \
  V(20,30,FA,55), V(AD,76,6D,F6), V(88,CC,76,91), V(F5,02,4C,25), \
  V(4F,E5,D7,FC), V(C5,2A,CB,D7), V(26,35,44,80), V(B5,62,A3,8F), \
  V(DE,B1,5A,49), V(25,BA,1B,67), V(45,EA,0E,98), V(5D,FE,C0,E1), \
  V(C3,2F,75,02), V(81,4C,F0,12), V(8D,46,97,A3), V(6B,D3,F9,C6), \
  V(03,8F,5F,E7), V(15,92,9C,95), V(BF,6D,7A,EB), V(95,52,59,DA), \
  V(D4,BE,83,2D), V(58,74,21,D3), V(49,E0,69,29), V(8E,C9,C8,44), \
  V(75,C2,89,6A), V(F4,8E,79,78), V(99,58,3E,6B), V(27,B9,71,DD), \
  V(BE,E1,4F,B6), V(F0,88,AD,17), V(C9,20,AC,66), V(7D,CE,3A,B4), \
  V(63,DF,4A,18), V(E5,1A,31,82), V(97,51,33,60), V(62,53,7F,45), \
  V(B1,64,77,E0), V(BB,6B,AE,84), V(FE,81,A0,1C), V(F9,08,2B,94), \
  V(70,48,68,58), V(8F,45,FD,19), V(94,DE,6C,87), V(52,7B,F8,B7), \
  V(AB,73,D3,23), V(72,4B,02,E2), V(E3,1F,8F,57), V(66,55,AB,2A), \
  V(B2,EB,28,07), V(2F,B5,C2,03), V(86,C5,7B,9A), V(D3,37,08,A5), \
  V(30,28,87,F2), V(23,BF,A5,B2), V(02,03,6A,BA), V(ED,16,82,5C), \
  V(8A,CF,1C,2B), V(A7,79,B4,92), V(F3,07,F2,F0), V(4E,69,E2,A1), \
  V(65,DA,F4,CD), V(06,05,BE,D5), V(D1,34,62,1F), V(C4,A6,FE,8A), \
  V(34,2E,53,9D), V(A2,F3,55,A0), V(05,8A,E1,32), V(A4,F6,EB,75), \
  V(0B,83,EC,39), V(40,60,EF,AA), V(5E,71,9F,06), V(BD,6E,10,51), \
  V(3E,21,8A,F9), V(96,DD,06,3D), V(DD,3E,05,AE), V(4D,E6,BD,46), \
  V(91,54,8D,B5), V(71,C4,5D,05), V(04,06,D4,6F), V(60,50,15,FF), \
  V(19,98,FB,24), V(D6,BD,E9,97), V(89,40,43,CC), V(67,D9,9E,77), \
  V(B0,E8,42,BD), V(07,89,8B,88), V(E7,19,5B,38), V(79,C8,EE,DB), \
  V(A1,7C,0A,47), V(7C,42,0F,E9), V(F8,84,1E,C9), V(00,00,00,00), \
  V(09,80,86,83), V(32,2B,ED,48), V(1E,11,70,AC), V(6C,5A,72,4E), \
  V(FD,0E,FF,FB), V(0F,85,38,56), V(3D,AE,D5,1E), V(36,2D,39,27), \
  V(0A,0F,D9,64), V(68,5C,A6,21), V(9B,5B,54,D1), V(24,36,2E,3A), \
  V(0C,0A,67,B1), V(93,57,E7,0F), V(B4,EE,96,D2), V(1B,9B,91,9E), \
  V(80,C0,C5,4F), V(61,DC,20,A2), V(5A,77,4B,69), V(1C,12,1A,16), \
  V(E2,93,BA,0A), V(C0,A0,2A,E5), V(3C,22,E0,43), V(12,1B,17,1D), \
  V(0E,09,0D,0B), V(F2,8B,C7,AD), V(2D,B6,A8,B9), V(14,1E,A9,C8), \
  V(57,F1,19,85), V(AF,75,07,4C), V(EE,99,DD,BB), V(A3,7F,60,FD), \
  V(F7,01,26,9F), V(5C,72,F5,BC), V(44,66,3B,C5), V(5B,FB,7E,34), \
  V(8B,43,29,76), V(CB,23,C6,DC), V(B6,ED,FC,68), V(B8,E4,F1,63), \
  V(D7,31,DC,CA), V(42,63,85,10), V(13,97,22,40), V(84,C6,11,20), \
  V(85,4A,24,7D), V(D2,BB,3D,F8), V(AE,F9,32,11), V(C7,29,A1,6D), \
  V(1D,9E,2F,4B), V(DC,B2,30,F3), V(0D,86,52,EC), V(77,C1,E3,D0), \
  V(2B,B3,16,6C), V(A9,70,B9,99), V(11,94,48,FA), V(47,E9,64,22), \
  V(A8,FC,8C,C4), V(A0,F0,3F,1A), V(56,7D,2C,D8), V(22,33,90,EF), \
  V(87,49,4E,C7), V(D9,38,D1,C1), V(8C,CA,A2,FE), V(98,D4,0B,36), \
  V(A6,F5,81,CF), V(A5,7A,DE,28), V(DA,B7,8E,26), V(3F,AD,BF,A4), \
  V(2C,3A,9D,E4), V(50,78,92,0D), V(6A,5F,CC,9B), V(54,7E,46,62), \
  V(F6,8D,13,C2), V(90,D8,B8,E8), V(2E,39,F7,5E), V(82,C3,AF,F5), \
  V(9F,5D,80,BE), V(69,D0,93,7C), V(6F,D5,2D,A9), V(CF,25,12,B3), \
  V(C8,AC,99,3B), V(10,18,7D,A7), V(E8,9C,63,6E), V(DB,3B,BB,7B), \
  V(CD,26,78,09), V(6E,59,18,F4), V(EC,9A,B7,01), V(83,4F,9A,A8), \
  V(E6,95,6E,65), V(AA,FF,E6,7E), V(21,BC,CF,08), V(EF,15,E8,E6), \
  V(BA,E7,9B,D9), V(4A,6F,36,CE), V(EA,9F,09,D4), V(29,B0,7C,D6), \
  V(31,A4,B2,AF), V(2A,3F,23,31), V(C6,A5,94,30), V(35,A2,66,C0), \
  V(74,4E,BC,37), V(FC,82,CA,A6), V(E0,90,D0,B0), V(33,A7,D8,15), \
  V(F1,04,98,4A), V(41,EC,DA,F7), V(7F,CD,50,0E), V(17,91,F6,2F), \
  V(76,4D,D6,8D), V(43,EF,B0,4D), V(CC,AA,4D,54), V(E4,96,04,DF), \
  V(9E,D1,B5,E3), V(4C,6A,88,1B), V(C1,2C,1F,B8), V(46,65,51,7F), \
  V(9D,5E,EA,04), V(01,8C,35,5D), V(FA,87,74,73), V(FB,0B,41,2E), \
  V(B3,67,1D,5A), V(92,DB,D2,52), V(E9,10,56,33), V(6D,D6,47,13), \
  V(9A,D7,61,8C), V(37,A1,0C,7A), V(59,F8,14,8E), V(EB,13,3C,89), \
  V(CE,A9,27,EE), V(B7,61,C9,35), V(E1,1C,E5,ED), V(7A,47,B1,3C), \
  V(9C,D2,DF,59), V(55,F2,73,3F), V(18,14,CE,79), V(73,C7,37,BF), \
  V(53,F7,CD,EA), V(5F,FD,AA,5B), V(DF,3D,6F,14), V(78,44,DB,86), \
  V(CA,AF,F3,81), V(B9,68,C4,3E), V(38,24,34,2C), V(C2,A3,40,5F), \
  V(16,1D,C3,72), V(BC,E2,25,0C), V(28,3C,49,8B), V(FF,0D,95,41), \
  V(39,A8,01,71), V(08,0C,B3,DE), V(D8,B4,E4,9C), V(64,56,C1,90), \
  V(7B,CB,84,61), V(D5,32,B6,70), V(48,6C,5C,74), V(D0,B8,57,42)

// The four inverse T-boxes are all generated from the single INVERSE_T_BOX
// entry list by redefining V() before each expansion. V() pastes its four
// hex byte tokens into one 32-bit literal; table n holds the entries of
// table 0 rotated right by n bytes.
// The tables are not const: when BPCL_AES_FEAT_DO_MASKING is enabled,
// aes_mask_tables() rotates and XOR-masks them in place.

// inverse_T_box_0: bytes in listed order (a,b,c,d)
#define V(a,b,c,d) 0x##a##b##c##d
/*lint -e{146}*/ /*As binary constant is not intended*/
static tU32 inverse_T_box_0[256] = { INVERSE_T_BOX };
#undef V

// inverse_T_box_1: byte order (d,a,b,c) = table 0 rotated right by 8 bits
#define V(a,b,c,d) 0x##d##a##b##c
/*lint -e{146}*/ /*As binary constant is not intended*/
static tU32 inverse_T_box_1[256] = { INVERSE_T_BOX };
#undef V

// inverse_T_box_2: byte order (c,d,a,b) = table 0 rotated right by 16 bits
#define V(a,b,c,d) 0x##c##d##a##b
/*lint -e{146}*/ /*As binary constant is not intended*/
static tU32 inverse_T_box_2[256] = { INVERSE_T_BOX };
#undef V

// inverse_T_box_3: byte order (b,c,d,a) = table 0 rotated right by 24 bits
#define V(a,b,c,d) 0x##b##c##d##a
/*lint -e{146}*/ /*As binary constant is not intended*/
static tU32 inverse_T_box_3[256] = { INVERSE_T_BOX };
#undef V

// the entry list is not needed any more once all four tables are instantiated
#undef INVERSE_T_BOX


// Key schedule round constants (FIPS-197 Sec.5.2): entry i is the 32-bit word
// [x^i in GF(2^8)]-[0]-[0]-[0], i.e. the field element sits in the most
// significant byte. aes_set_key indexes the table with its 0-based loop
// counter.
static const tU32 rcon[10] =
{
  0x01000000,  /* x^0                      */
  0x02000000,  /* x^1                      */
  0x04000000,  /* x^2                      */
  0x08000000,  /* x^3                      */
  0x10000000,  /* x^4                      */
  0x20000000,  /* x^5                      */
  0x40000000,  /* x^6                      */
  0x80000000,  /* x^7                      */
  0x1B000000,  /* x^8 (reduced in GF(2^8)) */
  0x36000000   /* x^9                      */
};

// T-box availability flag, checked by aes_set_key before the first use.
// It starts at 1 here, so aes_set_key never has to call the (empty)
// computation function below.
static int t_boxes_ready = 1;


// Placeholder with the same name and signature as the T-box computation
// function, kept so that aes_set_key compiles unchanged; it does nothing.
static void aes_compute_t_boxes( void )
{
  /* intentionally empty */
  return;
}


#endif // #elif ( BPCL_AES_FEAT_FIXED_T_BOXES == 1 )



/***************************************************************************/
/*!
   \par       FUNCTION F-AES/10: aes_set_key

   \par       DESCRIPTION

              This function does the AES key scheduling

   \par       EQUATION / REFERENCE

              FIPS-197 Sec.5.2

   \par       INPUT ARGUMENTS

   \param     (ctx) *aes_context         - pointer to initializing aes_context

   \param     const (tU8) *key          - pointer to key byte array

   \param     const (int) key_length     - length of key in bytes (only 16,24 or 32 possible)

   \param     const (tU8) IV            - 16 byte initialization vector

   \par       OUTPUT ARGUMENTS

   \param     (ctx) *aes_context         - pointer to initialized aes_context

   \par       RETURN VALUE

   \param     (int)  0                   - successful execution

   \param     (int) -1                   - parameter error

   \par       NOTES / WARNINGS / TODO's

   \note      Internal only!
*/
/***************************************************************************/
static int aes_set_key( /*@out@*/ aes_context *ctx,
                        const tU8 *key,
                        const tU32 key_length,
                        const tU8 *IV )
{
  // declarations
  int i;
  tU32 *enc_round_key, *dec_round_key;

  // compute T_boxes if necessary
  if ( t_boxes_ready == 0 )
  {
    aes_compute_t_boxes();
    t_boxes_ready = 1;
  }

  // set number of rounds (Nr) according to key_length (see Sec.5 in FIPS-197)
  switch( key_length )
  {
    case 16: ctx->number_rounds = 10; break;
    case 24: ctx->number_rounds = 12; break;
    case 32: ctx->number_rounds = 14; break;
    default: return -1;
  }

  // get pointer to encryption round keys
  enc_round_key = ctx->enc_round_keys;

  // enc_round_key[0..3/5/7] is the original key (first while in Fig.11 Sec.5.2 in FIPS-197)
  /*@+forloopexec@*/
  for ( i = 0; i < (int) (key_length >> 2); i++ )
  {
    M_BUF_TO_U32( enc_round_key[i], key, i * 4 );
  }
  /*@=forloopexec@*/
 
  // key expansion
  switch( key_length )
  {
    // 128-bit key
    case 16:

      // derive enc_round_key[4..43] from enc_round_key[0..3]
      for ( i = 0; i < 10; i++ )
      {
        // enc_round_key[i mod 4 == 0] = .. where i = this i * 4 (means i from FIPS-197 Fig.11)
        enc_round_key[4]  = ( enc_round_key[0]                                            ) ^ /* 32-bit w[i-4]                                                        */
                            ( forward_S_box[ (int)( (tU8) ( enc_round_key[3] >> 16 ) ) ] << 24 ) ^ /* 32-bit { S-box( enc_round_key[i-1]-a1 ),                             */
                            ( forward_S_box[ (int)( (tU8) ( enc_round_key[3] >>  8 ) ) ] << 16 ) ^ /*          S-box( enc_round_key[i-1]-a2 ),                             */
                            ( forward_S_box[ (int)( (tU8) ( enc_round_key[3]       ) ) ] <<  8 ) ^ /*          S-box( enc_round_key[i-1]-a3 ),                             */
                            ( forward_S_box[ (int)( (tU8) ( enc_round_key[3] >> 24 ) ) ]       ) ^ /*          S-box( enc_round_key[i-1]-a0 ) } = SubWord(RotWord(w[i-1])) */
                            ( rcon[i]                                                     );  /* 32-bit Rcon[i/4]                                                     */

        // enc_round_key[i mod 4 != 0] = w[i-4] XOR w[i-1], where i = this i * 4 (means i from FIPS-197 Fig.11)
        enc_round_key[5]  = enc_round_key[1] ^ enc_round_key[4];
        enc_round_key[6]  = enc_round_key[2] ^ enc_round_key[5];
        enc_round_key[7]  = enc_round_key[3] ^ enc_round_key[6];

        // next 4 32-bit words
        enc_round_key += 4;
      }
      break;

    // 192-bit key
    case 24:

      // build enc_round_key[6..53] from enc_round_key[0..5]
      for ( i = 0; i < 8; i++ )
      {
        // enc_round_key[i mod 6 == 0] = w[i-6] XOR SubWord(RotWord(w[i-1])) XOR Rcon[i/6]
        enc_round_key[6]  = ( enc_round_key[0]                                            ) ^
                            ( forward_S_box[ (int)( (tU8) ( enc_round_key[5] >> 16 ) ) ] << 24 ) ^
                            ( forward_S_box[ (int)( (tU8) ( enc_round_key[5] >>  8 ) ) ] << 16 ) ^
                            ( forward_S_box[ (int)( (tU8) ( enc_round_key[5]       ) ) ] <<  8 ) ^
                            ( forward_S_box[ (int)( (tU8) ( enc_round_key[5] >> 24 ) ) ]       ) ^
                            ( rcon[i]                                                           );

        // enc_round_key[i mod 6 != 0] = w[i-6] XOR w[i-1]
        enc_round_key[7]  = enc_round_key[1] ^ enc_round_key[6];
        enc_round_key[8]  = enc_round_key[2] ^ enc_round_key[7];
        enc_round_key[9]  = enc_round_key[3] ^ enc_round_key[8];
        enc_round_key[10] = enc_round_key[4] ^ enc_round_key[9];
        enc_round_key[11] = enc_round_key[5] ^ enc_round_key[10];

        // next 6 32-bit words
        enc_round_key += 6;
      }
      break;

    // 256-bit key
    case 32:

      // build enc_round_key[8..63] from enc_round_key[0..7]
      for ( i = 0; i < 7; i++ )
      {
        // enc_round_key[i mod 8 == 0] = w[i-8] XOR SubWord(RotWord(w[i-1])) XOR Rcon[i/8]
        enc_round_key[8]  = ( enc_round_key[0]                                            ) ^
                            ( forward_S_box[ (int)( (tU8) ( enc_round_key[7] >> 16 ) ) ] << 24 ) ^
                            ( forward_S_box[ (int)( (tU8) ( enc_round_key[7] >>  8 ) ) ] << 16 ) ^
                            ( forward_S_box[ (int)( (tU8) ( enc_round_key[7]       ) ) ] <<  8 ) ^
                            ( forward_S_box[ (int)( (tU8) ( enc_round_key[7] >> 24 ) ) ]       ) ^
                            ( rcon[i]                                                     );

        // enc_round_key[i mod 8 != 0 && i mod 8 != 4] = w[i-8] XOR w[i-1]
        enc_round_key[9]  = enc_round_key[1] ^ enc_round_key[8];
        enc_round_key[10] = enc_round_key[2] ^ enc_round_key[9];
        enc_round_key[11] = enc_round_key[3] ^ enc_round_key[10];

        // enc_round_key[i mod 8 == 4] = w[i-8] XOR SubWord(w[i-1])
        enc_round_key[12] = ( enc_round_key[4]                                             ) ^
                            ( forward_S_box[ (int)( (tU8) ( enc_round_key[11] >> 24 ) ) ] << 24 ) ^
                            ( forward_S_box[ (int)( (tU8) ( enc_round_key[11] >> 16 ) ) ] << 16 ) ^
                            ( forward_S_box[ (int)( (tU8) ( enc_round_key[11] >>  8 ) ) ] <<  8 ) ^
                            ( forward_S_box[ (int)( (tU8) ( enc_round_key[11]       ) ) ]       );

        // enc_round_key[i mod 8 != 0 && i mod 8 != 4] = w[i-8] XOR w[i-1]
        enc_round_key[13] = enc_round_key[5] ^ enc_round_key[12];
        enc_round_key[14] = enc_round_key[6] ^ enc_round_key[13];
        enc_round_key[15] = enc_round_key[7] ^ enc_round_key[14];

        // next 8 32-bit words
        enc_round_key += 8;
      }
      break;
  }

  // --- decryption schedule ----------------------------------------

  // setup InvMixColumns tables if necessary
  if ( inv_mix_cols_tables_ready == 0 )
  {
    // InvMixColumns() <=> eliminate SubBytes() in inverse_T_box 
    for ( i = 0; i < 256; i++ )
    {
      inv_mix_cols_table_0[i] = inverse_T_box_0[ forward_S_box[i] ];
      inv_mix_cols_table_1[i] = inverse_T_box_1[ forward_S_box[i] ];
      inv_mix_cols_table_2[i] = inverse_T_box_2[ forward_S_box[i] ];
      inv_mix_cols_table_3[i] = inverse_T_box_3[ forward_S_box[i] ];
    }

    inv_mix_cols_tables_ready = 1;
  }

  // --- decryption key expansion acc. to Fig.15 in FIPS-197 --------

  // get pointer to decryption round keys
  dec_round_key = ctx->dec_round_keys;
  
  // dec_round_key[0..3] = enc_round_key[40..43/48..51/56..59]
  dec_round_key[0] = *enc_round_key++;
  dec_round_key[1] = *enc_round_key++;
  dec_round_key[2] = *enc_round_key++;
  dec_round_key[3] = *enc_round_key++;
  dec_round_key += 4;

  // dec_round_key[i = 4..4*(number_rounds-1)] =  InvMixColumns( enc_round_key[i-8..i-4] )
  for ( i = 1; i < ctx->number_rounds; i++ )
  {
    enc_round_key -= 8;

    // dec_round_key[4..] = InvMixColumns(enc_round_key[x-8])
    *dec_round_key++ = inv_mix_cols_table_0[ (int)( (tU8) ( *enc_round_key >> 24 ) ) ] ^
                       inv_mix_cols_table_1[ (int)( (tU8) ( *enc_round_key >> 16 ) ) ] ^
                       inv_mix_cols_table_2[ (int)( (tU8) ( *enc_round_key >>  8 ) ) ] ^
                       inv_mix_cols_table_3[ (int)( (tU8) ( *enc_round_key       ) ) ]; 
    enc_round_key++;

    // dec_round_key[5..] = InvMixColumns(enc_round_key[x-7])
    *dec_round_key++ = inv_mix_cols_table_0[ (int)( (tU8) ( *enc_round_key >> 24 ) ) ] ^
                       inv_mix_cols_table_1[ (int)( (tU8) ( *enc_round_key >> 16 ) ) ] ^
                       inv_mix_cols_table_2[ (int)( (tU8) ( *enc_round_key >>  8 ) ) ] ^
                       inv_mix_cols_table_3[ (int)( (tU8) ( *enc_round_key       ) ) ];
    enc_round_key++;

    // dec_round_key[6..] = InvMixColumns(enc_round_key[x-6])
    *dec_round_key++ = inv_mix_cols_table_0[ (int)( (tU8) ( *enc_round_key >> 24 ) ) ] ^
                       inv_mix_cols_table_1[ (int)( (tU8) ( *enc_round_key >> 16 ) ) ] ^
                       inv_mix_cols_table_2[ (int)( (tU8) ( *enc_round_key >>  8 ) ) ] ^
                       inv_mix_cols_table_3[ (int)( (tU8) ( *enc_round_key       ) ) ];
    enc_round_key++;

    // dec_round_key[7..] = InvMixColumns(enc_round_key[x-5])
    *dec_round_key++ = inv_mix_cols_table_0[ (int)( (tU8) ( *enc_round_key >> 24 ) ) ] ^
                       inv_mix_cols_table_1[ (int)( (tU8) ( *enc_round_key >> 16 ) ) ] ^
                       inv_mix_cols_table_2[ (int)( (tU8) ( *enc_round_key >>  8 ) ) ] ^
                       inv_mix_cols_table_3[ (int)( (tU8) ( *enc_round_key       ) ) ];
    enc_round_key++;
  }

  enc_round_key -= 8;

  // dec_round_key[40..43/48..51/56..59] = enc_round_key[0..3]
  *dec_round_key++ = *enc_round_key++;
  *dec_round_key++ = *enc_round_key++;
  *dec_round_key++ = *enc_round_key++;
  *dec_round_key++ = *enc_round_key++;

  // set initialization vector
  if ( IV != NULL )
  {
    for ( i = 0; i < 16; i++ )  ctx->IV[i] = IV[i];
  }
  else
  {
    for ( i = 0; i < 16; i++ )  ctx->IV[i] = (tU8) 0;
  }

  // reset block_counter
 #if ( BPCL_AES_FEAT_DO_MASKING == 1 )
   
   block_counter = 0;

#endif

  // successful
  return  0;
}


/***************************************************************************/
/*!
   \par       FUNCTION F-AES/15: aes_mask_tables

   \par       DESCRIPTION

              This function renews the masking of either the four forward
              or the four inverse T-boxes: it draws new rotation offsets and
              new XOR masks, rotates every table from its old offset to the
              new one and replaces the old XOR mask by the new one

   \par       EQUATION / REFERENCE

              None

   \par       INPUT ARGUMENTS

   \param     (int) tables               - mask forward (1) or inverse (2) tables;
                                           any other value changes nothing

   \par       OUTPUT ARGUMENTS

   \param     None

   \par       RETURN VALUE

   \param     None

   \par       NOTES / WARNINGS / TODO's

   \note      Internal only!

   \note      Updates the file-level shift and mask arrays of the selected
              direction; entry [4] of the mask array becomes the XOR of the
              four new table masks.
*/
/***************************************************************************/
#if ( BPCL_AES_FEAT_DO_MASKING == 1 )

// Rotates a 256-entry table towards lower indices by delta positions
// (entry [k + delta] moves to [k], the first delta entries wrap to the end).
static void aes_rotate_table( tU32 *table, const int delta )
{
  tU32 wrapped[256];
  int k;

  /*@+forloopexec@*/
  // keep the entries that fall off the front
  for ( k = 0; k < delta; k++ )
  {
    wrapped[k] = table[k];
  }

  // move the remaining entries down
  for ( k = 0; k < (256 - delta); k++ )
  {
    table[k] = table[k + delta];
  }

  // append the saved entries at the end
  for ( k = (256 - delta); k < 256; k++ )
  {
    table[k] = wrapped[k - 256 + delta];
  }
  /*@=forloopexec@*/
}

// XORs every entry of a 256-entry table with the same value
static void aes_xor_table( tU32 *table, const tU32 value )
{
  int k;

  for ( k = 0; k < 256; k++ )
  {
    table[k] ^= value;
  }
}

static void aes_mask_tables( int tables )
{

  // declarations
  tU32 previous_mask[4];
  tU8  previous_shift[4];
  int k, delta;

  if ( tables == 1 )
  {
    // --- forward tables --------------------------------------------

    // remember the current offsets, then draw the new ones
    for ( k = 0; k < 4; k++ ) previous_shift[k] = forward_shift[k];
    for ( k = 0; k < 4; k++ ) forward_shift[k] = (tU8) ( rnd_word( 0xFF ) );

    // rotate each table by the distance between old and new offset
    delta = ( 256 - (int) previous_shift[0] + (int) forward_shift[0] ) % 256;
    aes_rotate_table( forward_T_box_0, delta );

    delta = ( 256 - (int) previous_shift[1] + (int) forward_shift[1] ) % 256;
    aes_rotate_table( forward_T_box_1, delta );

    delta = ( 256 - (int) previous_shift[2] + (int) forward_shift[2] ) % 256;
    aes_rotate_table( forward_T_box_2, delta );

    delta = ( 256 - (int) previous_shift[3] + (int) forward_shift[3] ) % 256;
    aes_rotate_table( forward_T_box_3, delta );

    // remember the current masks, then draw the new ones (m0..m3)
    for ( k = 0; k < 4; k++ ) previous_mask[k] = forward_mask[k];
    for ( k = 0; k < 4; k++ ) forward_mask[k] = rnd_word( 0xFFFFFFFF );

    // m4 = m0 ^ m1 ^ m2 ^ m3
    forward_mask[4] = forward_mask[0] ^ forward_mask[1] ^ forward_mask[2] ^ forward_mask[3];

    // XOR with (new ^ old) removes the old mask and applies the new one
    aes_xor_table( forward_T_box_0, forward_mask[0] ^ previous_mask[0] );
    aes_xor_table( forward_T_box_1, forward_mask[1] ^ previous_mask[1] );
    aes_xor_table( forward_T_box_2, forward_mask[2] ^ previous_mask[2] );
    aes_xor_table( forward_T_box_3, forward_mask[3] ^ previous_mask[3] );
  }
  else if ( tables == 2 )
  {
    // --- inverse tables --------------------------------------------

    // remember the current offsets, then draw the new ones
    for ( k = 0; k < 4; k++ ) previous_shift[k] = inverse_shift[k];
    for ( k = 0; k < 4; k++ ) inverse_shift[k] = (tU8) ( rnd_word( 0xFF ) );

    // rotate each table by the distance between old and new offset
    delta = ( 256 - (int) previous_shift[0] + (int) inverse_shift[0] ) % 256;
    aes_rotate_table( inverse_T_box_0, delta );

    delta = ( 256 - (int) previous_shift[1] + (int) inverse_shift[1] ) % 256;
    aes_rotate_table( inverse_T_box_1, delta );

    delta = ( 256 - (int) previous_shift[2] + (int) inverse_shift[2] ) % 256;
    aes_rotate_table( inverse_T_box_2, delta );

    delta = ( 256 - (int) previous_shift[3] + (int) inverse_shift[3] ) % 256;
    aes_rotate_table( inverse_T_box_3, delta );

    // remember the current masks, then draw the new ones (m0..m3)
    for ( k = 0; k < 4; k++ ) previous_mask[k] = inverse_mask[k];
    for ( k = 0; k < 4; k++ ) inverse_mask[k] = rnd_word( 0xFFFFFFFF );

    // m4 = m0 ^ m1 ^ m2 ^ m3
    inverse_mask[4] = inverse_mask[0] ^ inverse_mask[1] ^ inverse_mask[2] ^ inverse_mask[3];

    // XOR with (new ^ old) removes the old mask and applies the new one
    aes_xor_table( inverse_T_box_0, inverse_mask[0] ^ previous_mask[0] );
    aes_xor_table( inverse_T_box_1, inverse_mask[1] ^ previous_mask[1] );
    aes_xor_table( inverse_T_box_2, inverse_mask[2] ^ previous_mask[2] );
    aes_xor_table( inverse_T_box_3, inverse_mask[3] ^ previous_mask[3] );
  }

}

#endif



/***************************************************************************/
/*!
   \par       FUNCTION F-AES/20: aes_encrypt_16_byte_block

   \par       DESCRIPTION

              This function encrypts one 16 byte block with the encryption
              round keys of the given context

   \par       EQUATION / REFERENCE

              FIPS-197 Sec.5.1

   \par       INPUT ARGUMENTS

   \param     (ctx) *aes_context         - pointer to underlying aes_context
                                           (enc_round_keys and number_rounds are read)

   \param     (tU8) *plain              - 16 byte plain text array

   \par       OUTPUT ARGUMENTS

   \param     (tU8) *cipher             - 16 byte cipher text array

   \par       RETURN VALUE

   \param     None

   \par       NOTES / WARNINGS / TODO's

   \note      Internal only!

   \note      All input bytes are read before the first output byte is
              written; aes_encrypt passes the same buffer for plain and
              cipher.

   \note      With BPCL_AES_FEAT_DO_MASKING == 1 the function also updates
              block_counter and may call aes_mask_tables( 1 ).
*/
/***************************************************************************/
static void aes_encrypt_16_byte_block( const aes_context *ctx,
                                       const tU8 *plain,
                                   /*@out@*/ tU8 *cipher )
{
  
  // declarations
  // NOTE: the AES_FORWARD_ROUND macro refers to the local 'round_key' by name
  tU32 *round_key;
  tU32  cx_0, cx_1, cx_2, cx_3, cy_0, cy_1, cy_2, cy_3; // state array columns

  // get encryption round keys (the cast drops the const of ctx; the keys are
  // only read in the code visible here)
  round_key = (tU32 *) ctx->enc_round_keys;

  // read 16 plain text bytes into four 32-bit words (FIPS-197: state = in )
  M_BUF_TO_U32( cx_0, plain,  0 ); 
  M_BUF_TO_U32( cx_1, plain,  4 ); 
  M_BUF_TO_U32( cx_2, plain,  8 );
  M_BUF_TO_U32( cx_3, plain, 12 );

#if ( BPCL_AES_FEAT_DO_MASKING == 1 )
  
  // renew the forward table masks whenever block_counter is a multiple of
  // BPCL_AES_FEAT_NEW_MASK_FREQ
  if ( (block_counter % BPCL_AES_FEAT_NEW_MASK_FREQ) == 0 ) aes_mask_tables( 1 );

  // count processed blocks, wrapping from 255 back to 0
  if ( block_counter < 255 ) block_counter++;
    else block_counter = 0;

  // mask plain text: every state column is XORed with the combined mask m4
  cx_0 ^= forward_mask[4];
  cx_1 ^= forward_mask[4];
  cx_2 ^= forward_mask[4];
  cx_3 ^= forward_mask[4];

#endif

  // XOR it with encryption round_key[0..3] (FIPS-197: AddRoundKey(state, w[0, Nb-1])
  cx_0 ^= round_key[0];
  cx_1 ^= round_key[1];
  cx_2 ^= round_key[2];
  cx_3 ^= round_key[3];

  // do encryption rounds 1..9; the state alternates between cx_* and cy_*.
  // The unmasked AES_FORWARD_ROUND advances round_key by 4 words per round
  // (NOTE(review): masked variant not checked here, presumably the same).
  AES_FORWARD_ROUND( cy_0, cy_1, cy_2, cy_3, cx_0, cx_1, cx_2, cx_3 );   // round 1 
  AES_FORWARD_ROUND( cx_0, cx_1, cx_2, cx_3, cy_0, cy_1, cy_2, cy_3 );   // round 2 
  AES_FORWARD_ROUND( cy_0, cy_1, cy_2, cy_3, cx_0, cx_1, cx_2, cx_3 );   // round 3 
  AES_FORWARD_ROUND( cx_0, cx_1, cx_2, cx_3, cy_0, cy_1, cy_2, cy_3 );   // round 4 
  AES_FORWARD_ROUND( cy_0, cy_1, cy_2, cy_3, cx_0, cx_1, cx_2, cx_3 );   // round 5 
  AES_FORWARD_ROUND( cx_0, cx_1, cx_2, cx_3, cy_0, cy_1, cy_2, cy_3 );   // round 6 
  AES_FORWARD_ROUND( cy_0, cy_1, cy_2, cy_3, cx_0, cx_1, cx_2, cx_3 );   // round 7 
  AES_FORWARD_ROUND( cx_0, cx_1, cx_2, cx_3, cy_0, cy_1, cy_2, cy_3 );   // round 8 
  AES_FORWARD_ROUND( cy_0, cy_1, cy_2, cy_3, cx_0, cx_1, cx_2, cx_3 );   // round 9 

  // encryption rounds 10..11, if key length > 16 byte
  if ( ctx->number_rounds > 10 )
  {
    AES_FORWARD_ROUND( cx_0, cx_1, cx_2, cx_3, cy_0, cy_1, cy_2, cy_3 ); // round 10 
    AES_FORWARD_ROUND( cy_0, cy_1, cy_2, cy_3, cx_0, cx_1, cx_2, cx_3 ); // round 11 
  }

  // encryption rounds 12..13, if key length > 24 byte
  if ( ctx->number_rounds > 12 )
  {
    AES_FORWARD_ROUND( cx_0, cx_1, cx_2, cx_3, cy_0, cy_1, cy_2, cy_3 ); // round 12 
    AES_FORWARD_ROUND( cy_0, cy_1, cy_2, cy_3, cx_0, cx_1, cx_2, cx_3 ); // round 13 
  }

  // in every case the state is in cy_* at this point

#if ( BPCL_AES_FEAT_DO_MASKING == 1 )
  
  // demask state before the final round, which uses forward_S_box directly
  cy_0 ^= forward_mask[4];
  cy_1 ^= forward_mask[4];
  cy_2 ^= forward_mask[4];
  cy_3 ^= forward_mask[4];

#endif
  
  // final forward S-box round including ShiftRows() via input structure
  // (no MixColumns in the last round): output column c takes its four bytes
  // from input columns c, c+1, c+2, c+3 (mod 4)
  round_key += 4;

  cx_0 = round_key[0] ^ ( forward_S_box[ (int)( (tU8) ( cy_0 >> 24 ) ) ] << 24 ) ^
                        ( forward_S_box[ (int)( (tU8) ( cy_1 >> 16 ) ) ] << 16 ) ^
                        ( forward_S_box[ (int)( (tU8) ( cy_2 >>  8 ) ) ] <<  8 ) ^
                        ( forward_S_box[ (int)( (tU8) ( cy_3       ) ) ]       );

  cx_1 = round_key[1] ^ ( forward_S_box[ (int)( (tU8) ( cy_1 >> 24 ) ) ] << 24 ) ^
                        ( forward_S_box[ (int)( (tU8) ( cy_2 >> 16 ) ) ] << 16 ) ^
                        ( forward_S_box[ (int)( (tU8) ( cy_3 >>  8 ) ) ] <<  8 ) ^
                        ( forward_S_box[ (int)( (tU8) ( cy_0       ) ) ]       );

  cx_2 = round_key[2] ^ ( forward_S_box[ (int)( (tU8) ( cy_2 >> 24 ) ) ] << 24 ) ^
                        ( forward_S_box[ (int)( (tU8) ( cy_3 >> 16 ) ) ] << 16 ) ^
                        ( forward_S_box[ (int)( (tU8) ( cy_0 >>  8 ) ) ] <<  8 ) ^
                        ( forward_S_box[ (int)( (tU8) ( cy_1       ) ) ]       );

  cx_3 = round_key[3] ^ ( forward_S_box[ (int)( (tU8) ( cy_3 >> 24 ) ) ] << 24 ) ^
                        ( forward_S_box[ (int)( (tU8) ( cy_0 >> 16 ) ) ] << 16 ) ^
                        ( forward_S_box[ (int)( (tU8) ( cy_1 >>  8 ) ) ] <<  8 ) ^
                        ( forward_S_box[ (int)( (tU8) ( cy_2       ) ) ]       );

  // write result into cipher text byte array (FIPS 197: out = state )
  M_U32_TO_BUF( cx_0, cipher,  0 );
  M_U32_TO_BUF( cx_1, cipher,  4 );
  M_U32_TO_BUF( cx_2, cipher,  8 );
  M_U32_TO_BUF( cx_3, cipher, 12 );
}




/***************************************************************************/
/*!
   \par       FUNCTION F-AES/20: aes_decrypt_16_byte_block

   \par       DESCRIPTION

              This function decrypts one 16 byte block with the decryption
              round keys of the given context

   \par       EQUATION / REFERENCE

              FIPS-197 Sec.5.3.5

   \par       INPUT ARGUMENTS

   \param     (ctx) *aes_context         - pointer to underlying aes_context
                                           (dec_round_keys and number_rounds are read)

   \param     (tU8) *cipher             - 16 byte cipher text array

   \par       OUTPUT ARGUMENTS

   \param     (tU8) *plain              - 16 byte plain text array

   \par       RETURN VALUE

   \param     None

   \par       NOTES / WARNINGS / TODO's

   \note      Internal only!

   \note      All input bytes are read before the first output byte is
              written.

   \note      With BPCL_AES_FEAT_DO_MASKING == 1 the function also updates
              block_counter and may call aes_mask_tables( 2 ).
*/
/***************************************************************************/
static void aes_decrypt_16_byte_block( const aes_context *ctx,
                                       const tU8 *cipher,
                                   /*@out@*/ tU8 *plain )
{

  // declarations
  // NOTE(review): AES_INVERSE_ROUND presumably refers to the local
  // 'round_key' by name like AES_FORWARD_ROUND does - keep the name
  tU32 *round_key;
  tU32 cx_0, cx_1, cx_2, cx_3, cy_0, cy_1, cy_2, cy_3; // state array columns

  // get decryption round keys (the cast drops the const of ctx; the keys are
  // only read in the code visible here)
  round_key = (tU32 *) ctx->dec_round_keys;

  // read 16 cipher text bytes into four 32-bit words (FIPS-197: state = in )
  M_BUF_TO_U32( cx_0, cipher,  0 ); 
  M_BUF_TO_U32( cx_1, cipher,  4 );
  M_BUF_TO_U32( cx_2, cipher,  8 );
  M_BUF_TO_U32( cx_3, cipher, 12 );

#if ( BPCL_AES_FEAT_DO_MASKING == 1 )

  // renew the inverse table masks whenever block_counter is a multiple of
  // BPCL_AES_FEAT_NEW_MASK_FREQ
  if ( (block_counter % BPCL_AES_FEAT_NEW_MASK_FREQ) == 0 ) aes_mask_tables( 2 );

  // count processed blocks, wrapping from 255 back to 0
  if ( block_counter < 255 ) block_counter++;
    else block_counter = 0;
  
  // mask cipher: every state column is XORed with the combined mask m4
  cx_0 ^= inverse_mask[4];
  cx_1 ^= inverse_mask[4];
  cx_2 ^= inverse_mask[4];
  cx_3 ^= inverse_mask[4];

#endif

  // order of dec_round_key[i] inverse to FIPS-197 dw[i]! (dec_round_key[i] == dw[Nr*4-i])
  // XOR it with dec_round_key[0..3] (FIPS-197: AddRoundKey(state, dw[Nr*Nb, (Nr+1)*Nb-1])
  cx_0 ^= round_key[0];
  cx_1 ^= round_key[1];
  cx_2 ^= round_key[2];
  cx_3 ^= round_key[3];

  // do decryption rounds 1..9; the state alternates between cx_* and cy_*
  AES_INVERSE_ROUND( cy_0, cy_1, cy_2, cy_3, cx_0, cx_1, cx_2, cx_3 );   // round 1 
  AES_INVERSE_ROUND( cx_0, cx_1, cx_2, cx_3, cy_0, cy_1, cy_2, cy_3 );   // round 2 
  AES_INVERSE_ROUND( cy_0, cy_1, cy_2, cy_3, cx_0, cx_1, cx_2, cx_3 );   // round 3 
  AES_INVERSE_ROUND( cx_0, cx_1, cx_2, cx_3, cy_0, cy_1, cy_2, cy_3 );   // round 4 
  AES_INVERSE_ROUND( cy_0, cy_1, cy_2, cy_3, cx_0, cx_1, cx_2, cx_3 );   // round 5 
  AES_INVERSE_ROUND( cx_0, cx_1, cx_2, cx_3, cy_0, cy_1, cy_2, cy_3 );   // round 6 
  AES_INVERSE_ROUND( cy_0, cy_1, cy_2, cy_3, cx_0, cx_1, cx_2, cx_3 );   // round 7 
  AES_INVERSE_ROUND( cx_0, cx_1, cx_2, cx_3, cy_0, cy_1, cy_2, cy_3 );   // round 8 
  AES_INVERSE_ROUND( cy_0, cy_1, cy_2, cy_3, cx_0, cx_1, cx_2, cx_3 );   // round 9 

  // decryption rounds 10..11, if key length > 16 byte
  if ( ctx->number_rounds > 10 )
  {
    AES_INVERSE_ROUND( cx_0, cx_1, cx_2, cx_3, cy_0, cy_1, cy_2, cy_3 ); // round 10 
    AES_INVERSE_ROUND( cy_0, cy_1, cy_2, cy_3, cx_0, cx_1, cx_2, cx_3 ); // round 11 
  }

  // decryption rounds 12..13, if key length > 24 byte
  if ( ctx->number_rounds > 12 )
  {
    AES_INVERSE_ROUND( cx_0, cx_1, cx_2, cx_3, cy_0, cy_1, cy_2, cy_3 ); // round 12 
    AES_INVERSE_ROUND( cy_0, cy_1, cy_2, cy_3, cx_0, cx_1, cx_2, cx_3 ); // round 13 
  }

  // in every case the state is in cy_* at this point

#if ( BPCL_AES_FEAT_DO_MASKING == 1 )
  
  // demask state before the final round, which uses inverse_S_box directly
  cy_0 ^= inverse_mask[4];
  cy_1 ^= inverse_mask[4];
  cy_2 ^= inverse_mask[4];
  cy_3 ^= inverse_mask[4];

#endif

  // final inverse S-box round including InvShiftRows() via input structure:
  // output column c takes its four bytes from input columns c, c-1, c-2,
  // c-3 (mod 4)
  round_key += 4;

  cx_0 = round_key[0] ^ ( inverse_S_box[ (int)( (tU8) ( cy_0 >> 24 ) ) ] << 24 ) ^
                        ( inverse_S_box[ (int)( (tU8) ( cy_3 >> 16 ) ) ] << 16 ) ^
                        ( inverse_S_box[ (int)( (tU8) ( cy_2 >>  8 ) ) ] <<  8 ) ^
                        ( inverse_S_box[ (int)( (tU8) ( cy_1       ) ) ]       );

  cx_1 = round_key[1] ^ ( inverse_S_box[ (int)( (tU8) ( cy_1 >> 24 ) ) ] << 24 ) ^
                        ( inverse_S_box[ (int)( (tU8) ( cy_0 >> 16 ) ) ] << 16 ) ^
                        ( inverse_S_box[ (int)( (tU8) ( cy_3 >>  8 ) ) ] <<  8 ) ^
                        ( inverse_S_box[ (int)( (tU8) ( cy_2       ) ) ]       );

  cx_2 = round_key[2] ^ ( inverse_S_box[ (int)( (tU8) ( cy_2 >> 24 ) ) ] << 24 ) ^
                        ( inverse_S_box[ (int)( (tU8) ( cy_1 >> 16 ) ) ] << 16 ) ^
                        ( inverse_S_box[ (int)( (tU8) ( cy_0 >>  8 ) ) ] <<  8 ) ^
                        ( inverse_S_box[ (int)( (tU8) ( cy_3       ) ) ]       );

  cx_3 = round_key[3] ^ ( inverse_S_box[ (int)( (tU8) ( cy_3 >> 24 ) ) ] << 24 ) ^
                        ( inverse_S_box[ (int)( (tU8) ( cy_2 >> 16 ) ) ] << 16 ) ^
                        ( inverse_S_box[ (int)( (tU8) ( cy_1 >>  8 ) ) ] <<  8 ) ^
                        ( inverse_S_box[ (int)( (tU8) ( cy_0       ) ) ]       );

  // write result into plain text byte array
  M_U32_TO_BUF( cx_0, plain,  0 );
  M_U32_TO_BUF( cx_1, plain,  4 );
  M_U32_TO_BUF( cx_2, plain,  8 );
  M_U32_TO_BUF( cx_3, plain, 12 );
}


/***************************************************************************/
/*!
   \par       FUNCTION F-AES/40: aes_encrypt

   \par       DESCRIPTION

              This function encrypts a byte array in various AES modes

   \par       EQUATION / REFERENCE

              FIPS-197

   \par       INPUT ARGUMENTS

   \param     const (int) aes_mode       - AES mode (ECB, CBC, CFB, OFB, CTR)

   \param     const (tU8) *aes_key      - pointer to key bytes array

   \param     const (tU32) key_length   - key length in bytes

   \param     const (tU8) IV            - pointer to 128-bit initialization vector (expects a 16-byte array!)

   \param     const (tU32) IV_length    - IV length in bytes (expect 16-byte array!)

   \param     (tU8) *plain_data         - pointer to plain data bytes array

   \param     const (tU32) data_length  - length of data in bytes (has to be multiple of 16 bytes!)

   \par       OUTPUT ARGUMENTS

   \param     (tU8) *cipher_data        - pointer to cipher data bytes array

   \par       RETURN VALUE

   \param     (int)  0                   - successful execution

   \param     (int) -1                   - parameter error

   \param     (int) -2                   - unknown AES mode

   \par       NOTES / WARNINGS / TODO's

   \note      data_length has to be multiple of 16 bytes!
*/
/***************************************************************************/
extern int aes_encrypt( const int    aes_mode,
                        const tU8  *aes_key,
                        const tU32  aes_key_length,
                        const tU8  *IV,
                        const tU32  IV_length,
                        tU8        *plain_data,
                        tU8        *cipher_data,
                        const tU32  data_length )
{

  // declarations
  aes_context aes_ctx;
  tU8 aes_buf[16];
  tU8 aes_ctr[16];

  tU8 mask, pos_mask, first_bit, carry_over_bit;
 
  tU32 num_blocks;

  tU8 *p_plain;
  tU8 *p_cipher;

  tU32 i,j;

  int ret;
  (tVoid)IV_length;
  // init encryption keys
  ret = aes_set_key( &aes_ctx, aes_key, aes_key_length, IV );
  (tVoid)ret;

  // get number of blocks and padding
  num_blocks = (tU32) ( data_length / 16 );

  // get pointers
  p_plain  = plain_data;
  p_cipher = cipher_data;

  // --- Electronic Codebook Mode (ECB) -------------------------------------
  if ( aes_mode == BPCL_AES_MODE_ECB )
  {
    // encrypt all 16 byte blocks
    for ( i = 0; i < num_blocks; i++ )
    {
      // fill buffer with plain
      for ( j = 0; j < 16; j++ ) aes_buf[j] = p_plain[j];

      // encrypt buffer
      aes_encrypt_16_byte_block( &aes_ctx, aes_buf, aes_buf );
      
      // write cipher
      for ( j = 0; j < 16; j++ ) p_cipher[j] = aes_buf[j];
      
      // next block
      p_plain  += 16;
      p_cipher += 16;
    }
  }
  // --- Cipher Block Chaining Mode (CBC) -----------------------------------
  else if ( aes_mode == BPCL_AES_MODE_CBC )
  {
    // fill buffer with IV
    for ( i = 0; i < 16; i++ ) aes_buf[i] = aes_ctx.IV[i];
    
    // encrypt all 16 byte blocks
    for ( i = 0; i < num_blocks; i++ )
    {
      // XOR plain with previous cipher
      for ( j = 0; j < 16; j++ ) aes_buf[j] = (tU8) (p_plain[j] ^ aes_buf[j]);

      // encrypt buffer
      aes_encrypt_16_byte_block( &aes_ctx, aes_buf, aes_buf );

      // write cipher
      for ( j = 0; j < 16; j++ ) p_cipher[j] = aes_buf[j];
      
      // next block
      p_plain  += 16;
      p_cipher += 16;
    }
  }
  // --- Cipher FeedBack Mode (CFB) -----------------------------------------
  else if ( aes_mode == BPCL_AES_MODE_CFB )
  {
    // fill buffer with IV
    for ( i = 0; i < 16; i++ ) aes_buf[i] = aes_ctx.IV[i];
    
    // encrypt all 16 byte blocks
    for ( i = 0; i < num_blocks; i++ )
    {
      // encrypt buffer
      aes_encrypt_16_byte_block( &aes_ctx, aes_buf, aes_buf );

      // XOR plain with encrypted buffer
      for ( j = 0; j < 16; j++ ) aes_buf[j] = (tU8) (p_plain[j] ^ aes_buf[j]);

      // write cipher
      for ( j = 0; j < 16; j++ ) p_cipher[j] = aes_buf[j];
      
      // next block
      p_plain  += 16;
      p_cipher += 16;
    }
  }
  // --- Output FeedBack Mode (OFB) -------------------------------------------------
  else if ( aes_mode == BPCL_AES_MODE_OFB )
  {
    // fill buffer with IV
    for ( i = 0; i < 16; i++ ) aes_buf[i] = aes_ctx.IV[i];
    
    // encrypt all 16 byte blocks
    for ( i = 0; i < num_blocks; i++ )
    {
      // encrypt buffer
      aes_encrypt_16_byte_block( &aes_ctx, aes_buf, aes_buf );

      // XOR plain with encrypted buffer
      for ( j = 0; j < 16; j++ ) p_cipher[j] = (tU8) (p_plain[j] ^ aes_buf[j]);

      // next block
      p_plain  += 16;
      p_cipher += 16;
    }
  }
  // --- Counter Mode (CTR) -------------------------------------------------
  else if ( aes_mode == BPCL_AES_MODE_CTR )
  {
    // init aes_ctr with IV
    for ( i = 0; i < 16; i++ ) aes_ctr[i] = aes_ctx.IV[i];

    // do n-times shifts of IV before start
    for ( i = 0; i < 50; i++ )
    {
      first_bit = (tU8) 0;

      // check lower portion of polynomial
      /*@-usedef@*/
      mask = (tU8) ( aes_ctr[15] & C_LFSR_LOW_ORDER_PORTION_POLYNOMIAL);
      /*@=usedef@*/
      
      for( pos_mask = (tU8) 0x80; pos_mask != (tU8) 0; pos_mask >>= 1 )
      {
        if ( (pos_mask & mask) != (tU8) 0 ) first_bit = (tU8) (first_bit == (tU8) 0); // toggle first_bit (first_bit = !first_bit)
      }
      
      // check high portion of polynomial (x^128)
      /*@-usedef@*/
      mask = (tU8) (aes_ctr[0] & C_LFSR_HIGH_ORDER_PORTION_POLYNOMIAL);
      /*@=usedef@*/
      for( pos_mask = (tU8) 0x80; pos_mask != (tU8) 0; pos_mask >>= 1 )
      {
        if ( (pos_mask & mask) != (tU8) 0 ) first_bit = (tU8) (first_bit == (tU8) 0); // toggle first_bit
      }

      // left shift
      for ( j = 0; j < 16; j++ )
      {
        carry_over_bit = (tU8) (aes_ctr[j] & 0x80);
        aes_ctr[j] = (tU8) ( (aes_ctr[j] << 1) | first_bit);
        first_bit = carry_over_bit;
      }
    }
    
    // decrypt all 16 byte blocks
    for ( i = 0; i < num_blocks; i++ )
    {
      // encrypt ctr to buffer
      aes_encrypt_16_byte_block( &aes_ctx, aes_ctr, aes_buf );

      // XOR encrypted ctr with plain
      for ( j = 0; j < 16; j++ ) p_cipher[j] = (tU8) (aes_buf[j] ^ p_plain[j]);

      // increase counter (but we do LFSR)
      // aes_ctr++
      
      first_bit = (tU8) 0;

      // check lower portion of polynomial
      mask = (tU8) ( aes_ctr[15] & C_LFSR_LOW_ORDER_PORTION_POLYNOMIAL);
      for( pos_mask = (tU8) 0x80; pos_mask != (tU8) 0; pos_mask >>= 1 )
      {
        if ( (pos_mask & mask) != (tU8) 0 )  first_bit = (tU8) (first_bit == (tU8) 0);
      }
      
      // check high portion of polynomial (x^128)
      mask = (tU8) (aes_ctr[0] & C_LFSR_HIGH_ORDER_PORTION_POLYNOMIAL);
      for( pos_mask = (tU8) 0x80; pos_mask != (tU8) 0; pos_mask >>= 1 )
      {
        if ( (pos_mask & mask) != (tU8) 0 ) first_bit = (tU8) (first_bit == (tU8) 0);
      }

      // left shift
      for ( j = 0; j < 16; j++ )
      {
        carry_over_bit = (tU8) (aes_ctr[j] & 0x80);
        aes_ctr[j] = (tU8) ( (aes_ctr[j] << 1) | first_bit);
        first_bit = carry_over_bit;
      }

      // next block
      p_plain  += 16;
      p_cipher += 16;
    }
  }
  // --- Unknown Mode -------------------------------------------------------
  else
  {
     return -2;
  }
 
  // successful
  return 0;

}


/***************************************************************************/
/*!
   \par       INTERNAL HELPERS FOR aes_decrypt

   \par       DESCRIPTION

              Small file-local building blocks: 16-byte copy, 16-byte XOR
              and one step of the LFSR that serves as the CTR mode counter.
*/
/***************************************************************************/

/* Copy one 16-byte block from src to dst. */
static void aes_dec_copy16( tU8 *dst, const tU8 *src )
{
  tU32 k;

  for ( k = 0; k < 16; k++ )
  {
    dst[k] = src[k];
  }
}

/* dst = lhs XOR rhs over one 16-byte block (dst may alias either input). */
static void aes_dec_xor16( tU8 *dst, const tU8 *lhs, const tU8 *rhs )
{
  tU32 k;

  for ( k = 0; k < 16; k++ )
  {
    dst[k] = (tU8) ( lhs[k] ^ rhs[k] );
  }
}

/* Return 1 if an odd number of the eight bits in tapped_bits is set, else 0. */
static tU8 aes_dec_tap_parity( tU8 tapped_bits )
{
  tU8 parity = (tU8) 0;
  tU8 probe;

  for ( probe = (tU8) 0x80; probe != (tU8) 0; probe >>= 1 )
  {
    if ( (probe & tapped_bits) != (tU8) 0 )
    {
      parity = (tU8) ( parity ^ (tU8) 1 );
    }
  }

  return parity;
}

/* Advance the 16-byte LFSR state (the CTR "counter") by one step, in place. */
static void aes_dec_lfsr_advance( tU8 *state )
{
  tU8  carry_in;
  tU8  msb;
  tU32 k;

  // feedback bit = parity of the tapped bits of the last and the first byte
  carry_in = (tU8) ( aes_dec_tap_parity( (tU8) ( state[15] & C_LFSR_LOW_ORDER_PORTION_POLYNOMIAL ) ) ^
                     aes_dec_tap_parity( (tU8) ( state[0]  & C_LFSR_HIGH_ORDER_PORTION_POLYNOMIAL ) ) );

  // shift every byte left by one, walking from byte 0 to byte 15.
  // The carry between bytes is kept as 0x80/0x00 (not 1/0), so for bytes
  // 1..15 it is OR-ed into bit 7 rather than bit 0. This must not be
  // "corrected" here: the encryption side performs the very same steps and
  // both directions have to generate the same counter sequence.
  for ( k = 0; k < 16; k++ )
  {
    msb      = (tU8) ( state[k] & 0x80 );
    state[k] = (tU8) ( (state[k] << 1) | carry_in );
    carry_in = msb;
  }
}

/***************************************************************************/
/*!
   \par       FUNCTION F-AES/50: aes_decrypt

   \par       DESCRIPTION

              Decrypts a byte array in 16-byte blocks using the selected
              AES mode of operation. ECB and CBC run the block decryption,
              whereas CFB, OFB and CTR only ever run the block encryption
              to produce the key stream that is XOR-ed onto the cipher.

   \par       EQUATION / REFERENCE

              FIPS-197

   \par       INPUT ARGUMENTS

   \param     const (int) aes_mode         - AES mode (ECB, CBC, CFB, OFB, CTR)

   \param     const (tU8) *aes_key         - pointer to key bytes array

   \param     const (tU32) aes_key_length  - key length in bytes

   \param     const (tU8) *IV              - pointer to 128-bit initialization vector

   \param     const (tU32) IV_length       - IV length in bytes (unused, a 16-byte array is expected)

   \param     (tU8) *cipher_data           - pointer to cipher data bytes array

   \param     const (tU32) data_length     - length of data in bytes (has to be multiple of 16 bytes!)

   \par       OUTPUT ARGUMENTS

   \param     (tU8) *plain_data            - pointer to plain data bytes array

   \par       RETURN VALUE

   \param     (int)  0                     - successful execution

   \param     (int) -1                     - parameter error (reserved, never returned by this implementation)

   \param     (int) -2                     - unknown AES mode

   \par       NOTES / WARNINGS / TODO's

   \note      data_length has to be multiple of 16 bytes! Only
              data_length / 16 full blocks are processed, any remaining
              bytes are neither read nor written.

   \note      The result of aes_set_key() is discarded.
              NOTE(review): confirm whether a key setup failure should be
              reported as -1.

   \note      Every cipher block is read completely before the matching
              plain block is written.

   \note      CTR mode does not increment a counter. The IV is loaded into
              an LFSR which is stepped 50 times before the first block and
              once after every block.
*/
/***************************************************************************/
extern int aes_decrypt( const int    aes_mode,
                        const tU8  *aes_key,
                        const tU32  aes_key_length,
                        const tU8  *IV,
                        const tU32  IV_length,
                        tU8        *plain_data,
                        tU8        *cipher_data,
                        const tU32  data_length )
{
  aes_context aes_ctx;

  tU8 work[16];    // block currently being processed
  tU8 chain[16];   // chaining value / key stream, depending on the mode

  tU8 *out = plain_data;
  tU8 *in  = cipher_data;

  tU32 blocks_left = (tU32) ( data_length / 16 );
  tU32 warmup;

  (tVoid)IV_length;

  // key schedule and IV go into the local context, return value is ignored
  (tVoid)aes_set_key( &aes_ctx, aes_key, aes_key_length, IV );

  // --- Electronic Codebook Mode (ECB) -------------------------------------
  if ( aes_mode == BPCL_AES_MODE_ECB )
  {
    for ( ; blocks_left != 0; blocks_left-- )
    {
      // plain = D(cipher), routed through the local work buffer
      aes_dec_copy16( work, in );
      aes_decrypt_16_byte_block( &aes_ctx, work, work );
      aes_dec_copy16( out, work );

      in  += 16;
      out += 16;
    }

    return 0;
  }

  // --- Cipher Block Chaining Mode (CBC) -----------------------------------
  if ( aes_mode == BPCL_AES_MODE_CBC )
  {
    // the IV acts as the "previous cipher block" of the first block
    aes_dec_copy16( chain, aes_ctx.IV );

    for ( ; blocks_left != 0; blocks_left-- )
    {
      // plain = D(cipher) XOR previous cipher
      aes_dec_copy16( work, in );
      aes_decrypt_16_byte_block( &aes_ctx, work, work );
      aes_dec_xor16( work, work, chain );

      // remember this cipher block before the output may overwrite it
      aes_dec_copy16( chain, in );
      aes_dec_copy16( out, work );

      in  += 16;
      out += 16;
    }

    return 0;
  }

  // --- Cipher FeedBack Mode (CFB) -----------------------------------------
  if ( aes_mode == BPCL_AES_MODE_CFB )
  {
    aes_dec_copy16( chain, aes_ctx.IV );

    for ( ; blocks_left != 0; blocks_left-- )
    {
      // plain = E(previous cipher) XOR cipher
      aes_encrypt_16_byte_block( &aes_ctx, chain, chain );
      aes_dec_xor16( work, chain, in );

      // this cipher block is the feedback for the next round
      aes_dec_copy16( chain, in );
      aes_dec_copy16( out, work );

      in  += 16;
      out += 16;
    }

    return 0;
  }

  // --- Output FeedBack Mode (OFB) -----------------------------------------
  if ( aes_mode == BPCL_AES_MODE_OFB )
  {
    aes_dec_copy16( chain, aes_ctx.IV );

    for ( ; blocks_left != 0; blocks_left-- )
    {
      // key stream is the repeatedly encrypted IV, independent of the data
      aes_encrypt_16_byte_block( &aes_ctx, chain, chain );
      aes_dec_xor16( out, chain, in );

      in  += 16;
      out += 16;
    }

    return 0;
  }

  // --- Counter Mode (CTR) -------------------------------------------------
  if ( aes_mode == BPCL_AES_MODE_CTR )
  {
    // here "work" holds the LFSR counter and "chain" the encrypted counter
    aes_dec_copy16( work, aes_ctx.IV );

    // do n-times shifts of IV before start
    for ( warmup = 0; warmup < 50; warmup++ )
    {
      aes_dec_lfsr_advance( work );
    }

    for ( ; blocks_left != 0; blocks_left-- )
    {
      // plain = E(counter) XOR cipher
      aes_encrypt_16_byte_block( &aes_ctx, work, chain );
      aes_dec_xor16( out, chain, in );

      // next counter value
      aes_dec_lfsr_advance( work );

      in  += 16;
      out += 16;
    }

    return 0;
  }

  // --- Unknown Mode -------------------------------------------------------
  return -2;
}

/*
 * BPCL_AES_Encrypt - BPCL front end for aes_encrypt().
 *
 * Encrypts data_len bytes from p_plain into p_cipher using the key in p_key
 * and the 16-byte IV in p_iv. key_mode and key_idx are accepted for interface
 * compatibility but are not evaluated here: the key always comes from p_key.
 *
 * If p_cipher is a null pointer the data is encrypted in place, i.e. the
 * result overwrites p_plain (same effect as passing p_plain for p_cipher).
 *
 * Returns the result of aes_encrypt() converted to tErrCode.
 */
tErrCode BPCL_AES_Encrypt(
	tU8				mode,		// BPCL_AES_MODE_*
	tU8				key_mode,	// BPCL_AES_OP_* (ignored)
	tU8				key_size,	// BPCL_AES_KEYSIZE_*
	tU8				*p_key,		// Buffer holding key
	tU8				key_idx,	// Index of internal key to be used (ignored)
	tU8				*p_iv,		// Buffer holding 128-bit initialization vector
	tU8				*p_plain,	// Buffer holding plain data
	tU8				*p_cipher,	// Buffer to carry encrypted data (providing
								// NULL or p_plain replaces plain data)
	tU32			data_len	// size of p_plain in bytes
) {
	(tVoid)key_mode;
	(tVoid)key_idx;

	// Honour the documented "NULL means in place" contract instead of
	// handing a null output pointer down to the block loop.
	if (p_cipher == (tU8 *)0) {
		p_cipher = p_plain;
	}

	// Key length in bytes is derived as (key_size + 1) * 8.
	// NOTE(review): presumably BPCL_AES_KEYSIZE_128/192/256 are 1/2/3,
	// giving 16/24/32 bytes - confirm against the header.
	return (tErrCode)aes_encrypt(mode,p_key,(key_size+1)*8,p_iv,16,p_plain,p_cipher,data_len);
}

/*
 * BPCL_AES_Decrypt - BPCL front end for aes_decrypt().
 *
 * Decrypts data_len bytes from p_cipher into p_plain using the key in p_key
 * and the 16-byte IV in p_iv. key_mode and key_idx are accepted for interface
 * compatibility but are not evaluated here: the key always comes from p_key.
 *
 * If p_plain is a null pointer the data is decrypted in place, i.e. the
 * result overwrites p_cipher (same effect as passing p_cipher for p_plain).
 *
 * Returns the result of aes_decrypt() converted to tErrCode.
 */
tErrCode BPCL_AES_Decrypt(
	tU8				mode,		// BPCL_AES_MODE_*
	tU8				key_mode,	// BPCL_AES_OP_* (ignored)
	tU8				key_size,	// BPCL_AES_KEYSIZE_*
	tU8				*p_key,		// Buffer holding key
	tU8				key_idx,	// Index of internal key to be used (ignored)
	tU8				*p_iv,		// Buffer holding 128-bit initialization vector
	tU8				*p_cipher,	// Buffer holding encrypted data
	tU8				*p_plain,	// Buffer to carry plain data (providing
								// NULL or p_cipher replaces encrypted data)
	tU32			data_len	// size of p_cipher in bytes
) {
	(tVoid)key_mode;
	(tVoid)key_idx;

	// Honour the documented "NULL means in place" contract instead of
	// handing a null output pointer down to the block loop.
	if (p_plain == (tU8 *)0) {
		p_plain = p_cipher;
	}

	// Key length in bytes is derived as (key_size + 1) * 8.
	// NOTE(review): presumably BPCL_AES_KEYSIZE_128/192/256 are 1/2/3,
	// giving 16/24/32 bytes - confirm against the header.
	return (tErrCode)aes_decrypt(mode,p_key,(key_size+1)*8,p_iv,16,p_plain,p_cipher,data_len);
}

#ifdef _BPCL_TEST

//-----------------------------------------------------------------------------
//	Test implementation
//-----------------------------------------------------------------------------

/*
 * BPCL_AES_Test - self test of the AES module (only built with _BPCL_TEST).
 *
 * test_type is a bit mask:
 *   BPCL_TEST_COMPLIANCE  - encrypt and decrypt the known-answer vectors in
 *                           ECB mode and compare the complete 16-byte results
 *   BPCL_TEST_PERFORMANCE - time ECB/CTR (AES-128) and CBC (AES-256) over
 *                           640 passes of a 16 kB buffer
 *
 * Returns BPCL_OK, or BPCL_ERR_TEST_FAILED if at least one compliance vector
 * did not match. The performance part never changes the result.
 */
tS32 BPCL_AES_Test(tU8 test_type) {

	tU32	i;
	tU32	k;
	int		mismatch;
	tU8 	res1[16], res2[16];
	tU8		buffer[16384] = { 0L };	/* 16kB for performance tests */
	tErrCode  rc = BPCL_OK;


	/* Both entries currently hold the same FIPS-197 AES-128 example vector. */
	struct {
		tU8  key[16];
		tU8  plain[16];
		tU8  cipher[16];
	} test_data[2] = {
		{
			// FIPS-197 Test-Vector AES-128
			{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
			  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
			{ 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
			  0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff },
			{ 0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30,
			  0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a }
		}, {
			// FIPS-197 Test-Vector AES-128
			{ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
			  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
			{ 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
			  0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff },
			{ 0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30,
			  0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a }
		}
	};

	printf("\n----------------------------------------------------------\n");
	printf("             AES Test\n");
	printf("----------------------------------------------------------\n");

	if(test_type & BPCL_TEST_COMPLIANCE) {
		printf("\nAES Compliance Test\n");
		for(i=0; i<2; ++i) {
			/* Start from cleared results so that a call which writes
			   nothing cannot pass on stale data of the previous vector. */
			for(k=0; k<16; ++k) {
				res1[k] = 0;
				res2[k] = 0;
			}

			/* NOTE(review): M_MASK_KEY is defined elsewhere; presumably it
			   converts the clear key into the form the library expects. */
			M_MASK_KEY(test_data[i].key, 16)
			BPCL_AES_Encrypt(BPCL_AES_MODE_ECB, BPCL_AES_OP_ARGKEY, BPCL_AES_KEYSIZE_128,
				             test_data[i].key, 0, (tU8*)0, test_data[i].plain, res1, 16);
			BPCL_AES_Decrypt(BPCL_AES_MODE_ECB, BPCL_AES_OP_ARGKEY, BPCL_AES_KEYSIZE_128,
				             test_data[i].key, 0, (tU8*)0, res1, res2, 16);

			/* Known-answer check over the whole block: the cipher must equal
			   the reference and the round trip must give back the plain. */
			mismatch = 0;
			for(k=0; k<16; ++k) {
				if( res1[k] != test_data[i].cipher[k] ||
					res2[k] != test_data[i].plain[k] ) {
					mismatch = 1;
				}
			}

			/* i is a tU32, so convert explicitly to match the %d conversion */
			if( mismatch ) {
				printf("AES Compliance Test #%d failed:\n", (int)i);
				rc = BPCL_ERR_TEST_FAILED;
			} else {
				printf("AES Compliance Test #%d okay\n", (int)i);
			}
		}
		printf("\nAES Compliance Test DONE\n");
	}
	
	if(test_type & BPCL_TEST_PERFORMANCE) {

		/* All runs below work in place on "buffer"; the CTR and CBC runs
		   also take their IV (and CBC-256 its key) from the start of that
		   same buffer. Only the elapsed time is of interest here. */
		printf("\nAES Performance Test\n");

		BPCL_TimerStart();
		for(i = 0; i < 640; ++i) {
			/* Loop over 10MB of data (640 * 16kB) */
			BPCL_AES_Encrypt(BPCL_AES_MODE_ECB, BPCL_AES_OP_ARGKEY, BPCL_AES_KEYSIZE_128,
				             test_data[0].key, 0, (tU8*)0, buffer, buffer, 16384);
		}
		BPCL_TimerReport("AES-128 Encryption ECB@10MB");

		BPCL_TimerStart();
		for(i = 0; i < 640; ++i) {
			/* Loop over 10MB of data (640 * 16kB) */
			BPCL_AES_Decrypt(BPCL_AES_MODE_ECB, BPCL_AES_OP_ARGKEY, BPCL_AES_KEYSIZE_128,
				             test_data[0].key, 0, (tU8*)0, buffer, buffer, 16384);
		}
		BPCL_TimerReport("AES-128 Decryption ECB@10MB");

		BPCL_TimerStart();
		for(i = 0; i < 640; ++i) {
			/* Loop over 10MB of data (640 * 16kB) */
			BPCL_AES_Encrypt(BPCL_AES_MODE_CTR, BPCL_AES_OP_ARGKEY, BPCL_AES_KEYSIZE_128,
				             test_data[0].key, 0, buffer, buffer, buffer, 16384);
		}
		BPCL_TimerReport("AES-128 Encryption CTR@10MB");

		BPCL_TimerStart();
		for(i = 0; i < 640; ++i) {
			/* Loop over 10MB of data (640 * 16kB) */
			BPCL_AES_Decrypt(BPCL_AES_MODE_CTR, BPCL_AES_OP_ARGKEY, BPCL_AES_KEYSIZE_128,
				             test_data[0].key, 0, buffer, buffer, buffer, 16384);
		}
		BPCL_TimerReport("AES-128 Decryption CTR@10MB");

		BPCL_TimerStart();
		for(i = 0; i < 640; ++i) {
			/* Loop over 10MB of data (640 * 16kB) */
			BPCL_AES_Encrypt(BPCL_AES_MODE_CBC, BPCL_AES_OP_ARGKEY, BPCL_AES_KEYSIZE_256,
				             buffer, 0, buffer, buffer, buffer, 16384);
		}
		BPCL_TimerReport("AES-256 Encryption CBC@10MB");

		BPCL_TimerStart();
		for(i = 0; i < 640; ++i) {
			/* Loop over 10MB of data (640 * 16kB) */
			BPCL_AES_Decrypt(BPCL_AES_MODE_CBC, BPCL_AES_OP_ARGKEY, BPCL_AES_KEYSIZE_256,
				             buffer, 0, buffer, buffer, buffer, 16384);
		}
		BPCL_TimerReport("AES-256 Decryption CBC@10MB");

		printf("\nAES Performance Test DONE\n");
	}
	return rc;
} /* BPCL_AES_Test() */

#endif /* _BPCL_TEST */
